image_downloader.py


import os
import time
import urllib
import requests
import progressbar
from PIL import Image
from io import BytesIO
from urllib.parse import quote

class simple_image_download:
    def __init__(self):
        pass

    def urls(self, keywords, limit, extensions={'.jpg', '.jpeg'}):
        keyword_to_search = [str(item).strip() for item in keywords.split(',')]
        i = 0
        links = []

        things = len(keyword_to_search) * limit

        bar = progressbar.ProgressBar(maxval=things, \
                                      widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()]).start()

        while i < len(keyword_to_search):
            #url = 'https://www.google.com/search?q=' + quote(
            #    keyword_to_search[i].encode(
            #        'utf-8')) + 'biw=1536&bih=674&tbm=isch&tbs=isz:m&sxsrf=ACYBGNSXXpS6YmAKUiLKKBs6xWb4uUY5gA:1581168823770&source=lnms&sa=X&ved=0ahUKEwioj8jwiMLnAhW9AhAIHbXTBMMQ_AUI3QUoAQ'
            url = 'https://www.google.com/search?q=' + quote(
                keyword_to_search[i].encode(
                    'utf-8')) + '&tbm=isch&hl=en&tbs=isz:m&sa=X&ved=0CAIQpwVqFwoTCMDZwfbhqu0CFQAAAAAdAAAAABAD&biw=1803&bih=1041'
            raw_html = self._download_page(url)

            end_object = -1;
            google_image_seen = False;
            j = 0

            while j < limit:
                while (True):
                    try:
                        new_line = raw_html.find('"https://', end_object + 1)
                        end_object = raw_html.find('"', new_line + 1)

                        buffor = raw_html.find('\\', new_line + 1, end_object)
                        if buffor != -1:
                            object_raw = (raw_html[new_line + 1:buffor])
                        else:
                            object_raw = (raw_html[new_line + 1:end_object])

                        if any(extension in object_raw for extension in extensions):
                            break

                    except Exception as e:
                        break


                try:
                    r = requests.get(object_raw, allow_redirects=True, timeout=1)
                    if('html' not in str(r.content)):
                        # mime = magic.Magic(mime=True)
                        image = Image.open(BytesIO(r.content))
                        # print(image.format)
                        # file_type = mime.from_buffer(r.content)
                        # file_type = "image/jpg"
                        # file_extension = f'.{file_type.split("/")[1]}'
                        file_extension = "." + image.format.lower()
                        if file_extension == '.png' and not google_image_seen:
                            google_image_seen = True
                            raise ValueError();
                        links.append(object_raw)
                        bar.update(bar.currval + 1)
                    else:
                        j -= 1
                except Exception as e:
                    j -= 1
                j += 1

            i += 1

        bar.finish()
        return(links)


    def download(self, keywords, limit, extensions={'.jpg', '.jpeg'}):
        keyword_to_search = [str(item).strip() for item in keywords.split(',')]
        main_directory = "simple_images/"
        i = 0

        things = len(keyword_to_search) * limit

        bar = progressbar.ProgressBar(maxval=things, \
                                      widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])

        bar.start()

        while i < len(keyword_to_search):
            self._create_directories(main_directory, keyword_to_search[i])
            #url = 'https://www.google.com/search?q=' + quote(
            #    keyword_to_search[i].encode('utf-8')) + '&biw=1536&bih=674&tbm=isch&tbs=isz:m&sxsrf=ACYBGNSXXpS6YmAKUiLKKBs6xWb4uUY5gA:1581168823770&source=lnms&sa=X&ved=0ahUKEwioj8jwiMLnAhW9AhAIHbXTBMMQ_AUI3QUoAQ'
            url = 'https://www.google.com/search?q=' + quote(
                keyword_to_search[i].encode(
                    'utf-8')) + '&tbm=isch&hl=en&tbs=isz:m&sa=X&ved=0CAIQpwVqFwoTCMDZwfbhqu0CFQAAAAAdAAAAABAD&biw=1803&bih=1041'
            raw_html = self._download_page(url)

            end_object = -1;
            google_image_seen = False;
            j = 0
            while j < limit:
                while (True):
                    try:
                        new_line = raw_html.find('"https://', end_object + 1)
                        end_object = raw_html.find('"', new_line + 1)

                        buffor = raw_html.find('\\', new_line + 1, end_object)
                        if buffor != -1:
                            object_raw = (raw_html[new_line+1:buffor])
                        else:
                            object_raw = (raw_html[new_line+1:end_object])

                        if any(extension in object_raw for extension in extensions):
                            break

                    except Exception as e:
                        break
                path = main_directory + keyword_to_search[i].replace(" ", "_")

                try:
                    r = requests.get(object_raw, allow_redirects=True, timeout=1)
                    if('html' not in str(r.content)):
                        #mime = magic.Magic(mime=True)
                        image = Image.open(BytesIO(r.content))
                        #print(image.format)
                        #file_type = mime.from_buffer(r.content)
                        #file_type = "image/jpg"
                        #file_extension = f'.{file_type.split("/")[1]}'
                        file_extension = "." + image.format.lower()
                        if file_extension not in extensions:
                            raise ValueError()
                        if file_extension == '.png' and not google_image_seen:
                            google_image_seen = True
                            raise ValueError()
                        file_name = str(keyword_to_search[i].replace(" ", "_")) + "_" + str(j + 1) + file_extension
                        with open(os.path.join(path, file_name), 'wb') as file:
                            file.write(r.content)
                        bar.update(bar.currval + 1)
                    else:
                        j -= 1
                except Exception as e:
                    j -= 1
                j += 1

            i += 1
        bar.finish()


    def _create_directories(self, main_directory, name):
        name = name.replace(" ", "_")
        try:
            if not os.path.exists(main_directory):
                os.makedirs(main_directory)
                time.sleep(0.2)
                path = (name)
                sub_directory = os.path.join(main_directory, path)
                if not os.path.exists(sub_directory):
                    os.makedirs(sub_directory)
            else:
                path = (name)
                sub_directory = os.path.join(main_directory, path)
                if not os.path.exists(sub_directory):
                    os.makedirs(sub_directory)

        except OSError as e:
            if e.errno != 17:
                raise
            pass
        return

    def _download_page(self,url):

        try:
            headers = {}
            headers['User-Agent'] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
            req = urllib.request.Request(url, headers=headers)
            resp = urllib.request.urlopen(req)
            respData = str(resp.read())
            return respData

        except Exception as e:
            print(e)
            exit(0)