# -*- coding: utf-8 -*-
"""
amazoncaptcha.solver
~~~~~~~~~~~~~~~~~~~~
This module contains AmazonCaptcha instance and all the requiries for it.
Attributes:
MONOWEIGHT (int): The bigger this number - the thicker a monochromed picture
MAXIMUM_LETTER_LENGTH (int): Maximum letter length by X axis
MINIMUM_LETTER_LENGTH (int): Minimum letter length by X axis
SUPPORTED_CONTENT_TYPES (list of str): Used when requesting a captcha url
to check if Content-Type in the headers is valid
"""
from .utils import cut_the_white, merge_horizontally, find_letter_boxes
from .exceptions import ContentTypeError
from PIL import Image, ImageChops
from io import BytesIO
import warnings
import requests
import json
import zlib
import os
try:
from selenium.webdriver.common.by import By
except:
By = None
#--------------------------------------------------------------------------------------------------------------
MONOWEIGHT = 1
MAXIMUM_LETTER_LENGTH = 33
MINIMUM_LETTER_LENGTH = 14
SUPPORTED_CONTENT_TYPES = ['image/jpeg']
#--------------------------------------------------------------------------------------------------------------
[docs]class AmazonCaptcha(object):
[docs] def __init__(self, img, image_link=None, devmode=False):
"""
Initializes the AmazonCaptcha instance.
Args:
img (str or io.BytesIO): Path to an input image OR an instance
of BytesIO representing this image.
image_link (str, optional): Used if `AmazonCaptcha` was created
using `fromdriver` class method. Defaults to None.
devmode (bool, optional): If set to True, instead of 'Not solved',
unrecognised letters will be replaced with dashes.
"""
self.img = Image.open(img, 'r')
self._image_link = image_link
self.devmode = devmode
self.letters = dict()
self.result = dict()
package_directory_path = os.path.abspath(os.path.dirname(os.path.abspath(__file__)))
self.training_data_folder = os.path.join(package_directory_path, 'training_data')
self.alphabet = [filename.split('.')[0] for filename in os.listdir(self.training_data_folder)]
@property
def image_link(self):
"""
Image link property is being assigned only if the instance was
created using `fromdriver` or `fromlink` class methods.
If you have created an AmazonCaptcha instance using the constructor,
the property will be equal to None which triggers the warning.
"""
if not self._image_link:
warnings.warn("Seems like you are trying to pull out the image link while not having it.", Warning, stacklevel=2)
return self._image_link
[docs] def _monochrome(self):
"""
Makes a captcha pure monochrome.
Literally says: "for each pixel of an image turn codes 0, 1 to a 0,
while everything in range from 2 to 255 should be replaced with 255".
*All the numbers stay for color codes.
"""
self.img = self.img.convert('L')
self.img = Image.eval(self.img, lambda a: 0 if a <= MONOWEIGHT else 255)
[docs] def _find_letters(self):
"""
Extracts letters from an image using found letter boxes.
Populates 'self.letters' with extracted letters being PIL.Image instances.
"""
letter_boxes = find_letter_boxes(self.img, MAXIMUM_LETTER_LENGTH)
letters = [self.img.crop((letter_box[0], 0, letter_box[1], self.img.height)) for letter_box in letter_boxes]
if (len(letters) == 6 and letters[0].width < MINIMUM_LETTER_LENGTH) or (len(letters) != 6 and len(letters) != 7):
letters = [Image.new('L', (200, 70)) for i in range(6)]
if len(letters) == 7:
letters[6] = merge_horizontally(letters[6], letters[0])
del letters[0]
letters = [cut_the_white(letter) for letter in letters]
self.letters = {str(k): v for k, v in zip(range(1, 7), letters)}
[docs] def _save_letters(self):
"""
Transforms separated letters into pseudo binary.
Populates 'self.letters' with pseudo binaries.
"""
for place, letter in self.letters.items():
letter_data = list(letter.getdata())
letter_data_string = ''.join(['1' if pix == 0 else '0' for pix in letter_data])
pseudo_binary = str(zlib.compress(letter_data_string.encode('utf-8')))
self.letters[place] = pseudo_binary
[docs] def _translate(self):
"""
Finds patterns to extracted pseudo binary strings from data folder.
Literally says: "for each pseudo binary scan every stored letter
pattern and find a match".
Returns:
str: a solution if there is one OR
'Not solved' if devmode set to False OR
a solution where unrecognised letters will be replaces with dashes
"""
for place, pseudo_binary in self.letters.items():
for letter in self.alphabet:
with open(os.path.join(self.training_data_folder, letter + '.json'), 'r', encoding = 'utf-8') as js:
data = json.loads(js.read())
if pseudo_binary in data:
self.result[place] = letter
break
else:
self.result[place] = '-'
if not self.devmode:
return 'Not solved'
return ''.join(self.result.values())
[docs] def solve(self, keep_logs=False, logs_path='not-solved-captcha.log'):
"""
Runs the sequence of solving a captcha.
Args:
keep_logs (bool): Not solved captchas will be logged if True.
Defaults to False.
logs_path (str): Path to the file where not solved captcha
links will be stored. Defaults to "not-solved-captcha.log".
Returns:
str: Result of the sequence.
"""
self._monochrome()
self._find_letters()
self._save_letters()
solution = self._translate()
if solution == 'Not solved' and keep_logs and self.image_link:
with open(logs_path, 'a', encoding='utf-8') as f:
f.write(self.image_link + '\n')
return solution
[docs] @classmethod
def fromdriver(cls, driver, devmode=False):
"""
Takes a screenshot from your webdriver, crops the captcha, and stores
it into bytes array, which is then used to create an AmazonCaptcha instance.
This also means avoiding any local savings.
Args:
driver (selenium.webdriver.*): Webdriver with opened captcha page.
devmode (bool, optional): If set to True, instead of 'Not solved',
unrecognised letters will be replaced with dashes.
Returns:
AmazonCaptcha: Instance created based on webdriver.
"""
png = driver.get_screenshot_as_png()
element = driver.find_element(By.TAG_NAME, 'img')
image_link = element.get_attribute('src')
location = element.location
size = element.size
left = location['x']
top = location['y']
right = location['x'] + size['width']
bottom = location['y'] + size['height']
img = Image.open(BytesIO(png))
img = img.crop((left, top, right, bottom))
bytes_array = BytesIO()
img.save(bytes_array, format='PNG')
image_bytes_array = BytesIO(bytes_array.getvalue())
return cls(image_bytes_array, image_link, devmode)
[docs] @classmethod
def fromlink(cls, image_link, devmode=False, timeout=120):
"""
Requests the given link and stores the content of the response
as `io.BytesIO` that is then used to create AmazonCaptcha instance.
This also means avoiding any local savings.
Args:
link (str): Link to Amazon's captcha image.
devmode (bool, optional): If set to True, instead of 'Not solved',
unrecognised letters will be replaced with dashes.
timeout (int, optional): Requests timeout.
Returns:
AmazonCaptcha: Instance created based on the image link.
Raises:
ContentTypeError: If response headers contain unsupported
content type.
"""
response = requests.get(image_link, timeout=timeout)
if response.headers['Content-Type'] not in SUPPORTED_CONTENT_TYPES:
raise ContentTypeError(response.headers['Content-Type'])
image_bytes_array = BytesIO(response.content)
return cls(image_bytes_array, image_link, devmode)
#--------------------------------------------------------------------------------------------------------------