Commit 3b626a0: Refactor

bohdanbobrowski committed Sep 5, 2024
1 parent 7f83b27 commit 3b626a0
Showing 10 changed files with 88 additions and 230 deletions.

21 changes: 5 additions & 16 deletions blog2epub/blog2epub_gui.py

@@ -10,7 +10,7 @@
 from itertools import cycle
 from pathlib import Path
 from threading import Thread
-from typing import Optional, List
+from typing import List
 from urllib import parse
 
 from kivy.uix.anchorlayout import AnchorLayout  # type: ignore
@@ -398,14 +398,6 @@ def _get_url(self):
             return self.url_entry.text
         raise BadUrlException("Blog url is not valid.")
 
-    @staticmethod
-    def _is_int(value) -> Optional[int]:
-        try:
-            int(value)
-            return int(value)
-        except ValueError:
-            return None
-
     def _download_ebook(self, blog2epub: Blog2Epub):
         blog2epub.download()
         self._enable_download_button()
@@ -484,23 +476,20 @@ def download(self, instance):
         self.save_settings()
         configuration = ConfigurationModel(
             **{
                 "include_images": True,
                 "images_size": (600, 800),
                 "images_quality": 40,
-                "limit": self._is_int(self.limit_entry.text),
-                "skip": self._is_int(self.skip_entry.text),
+                "limit": self.limit_entry.text,
+                "skip": self.skip_entry.text,
+                "destination_folder": str(Path.home()),
             }
         )
         self.blog2epub = Blog2Epub(
             **{
+                "interface": self.interface,
                 "url": self._get_url(),
                 "configuration": configuration,
-                "start": None,
-                "end": None,
-                "file_name": None,
                 "cache_folder": os.path.join(str(Path.home()), ".blog2epub"),
-                "destination_folder": str(Path.home()),
-                "interface": self.interface,
             }
         )
         self.download_thread = Thread(
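
With _is_int gone, the limit and skip fields are handed to ConfigurationModel as raw entry text, so any coercion or validation now lives in the model. A minimal sketch of building the same configuration outside the GUI; the model's module path and its tolerance for string inputs are assumptions, not confirmed by this diff:

    from pathlib import Path

    from blog2epub.models.configuration import ConfigurationModel  # assumed module path

    configuration = ConfigurationModel(
        include_images=True,
        images_size=(600, 800),   # maximum width/height for downloaded images
        images_quality=40,        # JPEG quality applied when resizing
        limit="10",               # raw entry text, as the GUI now passes it
        skip="0",
        destination_folder=str(Path.home()),
    )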

9 changes: 7 additions & 2 deletions blog2epub/blog2epub_main.py

@@ -1,5 +1,7 @@
 import logging
 
 from blog2epub.common.globals import VERSION
-from blog2epub.crawlers import BlogspotCrawler, WordpressCrawler
+from blog2epub.crawlers import BlogspotCrawler, WordpressCrawler, GenericCrawler
 
 
 class Blog2Epub:
@@ -10,7 +12,10 @@ class Blog2Epub:
     def __init__(self, **kwargs):
         if kwargs.get("url").find(".blogspot.") > -1:
             self.crawler = BlogspotCrawler(**kwargs)
-        self.crawler = WordpressCrawler(**kwargs)
+        if kwargs.get("url").find(".wordpress.com") > -1:
+            self.crawler = WordpressCrawler(**kwargs)
+        else:
+            self.crawler = GenericCrawler(**kwargs)
 
 
     def download(self):
         self.crawler.crawl()
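
Crawler selection now falls through to the new GenericCrawler when the URL matches neither Blogspot nor Wordpress. A hedged usage sketch; the keyword arguments mirror the GUI call above, and the interface import path and its instantiability are assumptions:

    import os
    from pathlib import Path

    from blog2epub.blog2epub_main import Blog2Epub
    from blog2epub.common.interfaces import EmptyInterface  # assumed import path

    blog2epub = Blog2Epub(
        interface=EmptyInterface(),   # assumes the base interface is instantiable
        url="https://example.wordpress.com",  # picks WordpressCrawler
        configuration=configuration,  # ConfigurationModel from the sketch above
        cache_folder=os.path.join(str(Path.home()), ".blog2epub"),
    )
    blog2epub.download()  # delegates to self.crawler.crawl()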

35 changes: 22 additions & 13 deletions blog2epub/common/downloader.py

@@ -5,14 +5,15 @@
 import re
 
 from http.cookiejar import CookieJar
-from typing import Optional
+from typing import Optional, List
 from urllib.parse import urlparse
 import time
 from PIL import Image
 
 import requests
 
 from blog2epub.models.book import DirModel
+from blog2epub.common.interfaces import EmptyInterface
 
 
 def prepare_directories(dirs: DirModel):
@@ -23,15 +24,23 @@ def prepare_directories(dirs: DirModel):
 
 
 class Downloader:
-    def __init__(self, crawler):
-        self.dirs = crawler.dirs
-        self.url_to_crawl = crawler.url_to_crawl
-        self.crawler_url = crawler.url
-        self.crawler_port = crawler.port
-        self.interface = crawler.interface
-        self.images_size = crawler.images_size
-        self.images_quality = crawler.images_quality
-        self.ignore_downloads = crawler.ignore_downloads
+    def __init__(
+        self,
+        dirs: DirModel,
+        url: str,
+        url_to_crawl: str,
+        interface: EmptyInterface,
+        images_size: List[int],
+        images_quality: int,
+        ignore_downloads: List[str],
+    ):
+        self.dirs = dirs
+        self.url = url
+        self.url_to_crawl = url_to_crawl
+        self.interface = interface
+        self.images_size = images_size
+        self.images_quality = images_quality
+        self.ignore_downloads = ignore_downloads
         self.cookies = CookieJar()
         self.session = requests.session()
         self.headers = {}
@@ -115,14 +124,14 @@ def get_content(self, url):
             interstitial = self.checkInterstitial(contents)
             if interstitial:
                 interstitial_url = (
-                    "http://" + self.crawler_url + "?interstitial=" + interstitial
+                    "http://" + self.url + "?interstitial=" + interstitial
                 )
                 self.file_download(
                     interstitial_url, self.get_filepath(interstitial_url)
                 )
                 contents = self.file_download(
-                    "http://" + self.crawler_url,
-                    self.get_filepath("http://" + self.crawler_url),
+                    "http://" + self.url,
+                    self.get_filepath("http://" + self.url),
                 )
         return contents
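
The constructor change is the heart of this file: Downloader no longer reaches into a crawler instance for its settings but receives every dependency explicitly, so it can be built and tested in isolation. A sketch under the assumption that DirModel accepts a base path and EmptyInterface is instantiable (neither is confirmed by this diff):

    from blog2epub.common.downloader import Downloader
    from blog2epub.common.interfaces import EmptyInterface  # assumed import path
    from blog2epub.models.book import DirModel

    downloader = Downloader(
        dirs=DirModel(path="/tmp/blog2epub_cache"),  # assumed DirModel signature
        url="example.blogspot.com",
        url_to_crawl="https://example.blogspot.com",
        interface=EmptyInterface(),
        images_size=[600, 800],
        images_quality=40,
        ignore_downloads=[],
    )
    contents = downloader.get_content("https://example.blogspot.com")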

2 changes: 1 addition & 1 deletion blog2epub/common/globals.py

@@ -1 +1 @@
-VERSION = "1.3.1"
+VERSION = "1.4.0"

175 changes: 17 additions & 158 deletions blog2epub/crawlers/abstract.py

@@ -6,19 +6,11 @@
 from pathlib import Path
 from typing import List, Optional, Dict
 from xml import etree
-import gzip
-import hashlib
-import imghdr
 import re
-import time
-from http.cookiejar import CookieJar
-from urllib.parse import urlparse
 
-import requests
 from lxml.html.soupparser import fromstring
-from PIL import Image
 
-from blog2epub.common.downloader import prepare_directories
+from blog2epub.common.downloader import Downloader
 import dateutil
 
 from blog2epub.common.book import Book
@@ -33,8 +25,6 @@
 
 
 class AbstractCrawler(ABC):
-    ignore_downloads: List[str] = []
-
     def __init__(
         self,
         url: str,
@@ -67,10 +57,25 @@ def __init__(
         self.articles: List[Article] = []
         self.article_counter = 0
         self.images: List[str] = []
-        self.downloader = Downloader(self)
         self.tags: Dict = {}
         self.active = False
         self.cancelled = False
+        self.ignore_downloads: List[str] = []
+        self.article_class = "Article"
+        self.content_xpath = (
+            "//div[contains(concat(' ',normalize-space(@class),' '),'post-body')]"
+        )
+        self.images_regex = r'<table[^>]*><tbody>[\s]*<tr><td[^>]*><a href="([^"]*)"[^>]*><img[^>]*></a></td></tr>[\s]*<tr><td class="tr-caption" style="[^"]*">([^<]*)'
+        self.articles_regex = r"<h3 class=\'post-title entry-title\' itemprop=\'name\'>[\s]*<a href=\'([^\']*)\'>([^>^<]*)</a>[\s]*</h3>"
+        self.downloader = Downloader(
+            dirs=self.dirs,
+            url=self.url,
+            url_to_crawl=self.url_to_crawl,
+            interface=self.interface,
+            images_size=self.configuration.images_size,
+            images_quality=self.configuration.images_quality,
+            ignore_downloads=self.ignore_downloads,
+        )
 
     @abstractmethod
     def crawl(self):
@@ -333,149 +338,3 @@ def process(self):
         self.get_content()
         self.get_tags()
         self.get_comments()
-
-
-class Downloader:
-    def __init__(self, crawler):
-        self.dirs = crawler.dirs
-        self.url_to_crawl = crawler.url_to_crawl
-        self.crawler_url = crawler.url
-        self.crawler_port = crawler.port
-        self.interface = crawler.interface
-        self.images_size = crawler.images_size
-        self.images_quality = crawler.images_quality
-        self.ignore_downloads = crawler.ignore_downloads
-        self.cookies = CookieJar()
-        self.session = requests.session()
-        self.headers = {}
-
-    def get_urlhash(self, url):
-        m = hashlib.md5()
-        m.update(url.encode("utf-8"))
-        return m.hexdigest()
-
-    def file_write(self, contents, filepath):
-        filepath = filepath + ".gz"
-        with gzip.open(filepath, "wb") as f:
-            f.write(contents.encode("utf-8"))
-
-    def file_read(self, filepath):
-        if os.path.isfile(filepath + ".gz"):
-            with gzip.open(filepath + ".gz", "rb") as f:
-                contents = f.read().decode("utf-8")
-        else:
-            with open(filepath, "rb") as html_file:
-                contents = html_file.read().decode("utf-8")
-            self.file_write(contents, filepath)
-            os.remove(filepath)
-        return contents
-
-    def get_filepath(self, url):
-        return os.path.join(self.dirs.html, self.get_urlhash(url) + ".html")
-
-    def _is_url_in_ignored(self, url) -> bool:
-        for search_rule in self.ignore_downloads:
-            if re.match(search_rule, url):
-                return True
-        return False
-
-    def file_download(self, url: str, filepath: str) -> Optional[str]:
-        if self._is_url_in_ignored(url):
-            return None
-        prepare_directories(self.dirs)
-        try:
-            response = self.session.get(url, cookies=self.cookies, headers=self.headers)
-        except requests.exceptions.ConnectionError:
-            return None
-        self.cookies = response.cookies
-        data = response.content
-        contents = data.decode("utf-8")
-        self.file_write(contents, filepath)
-        return contents
-
-    def image_download(self, url: str, filepath: str) -> bool | None:
-        if self._is_url_in_ignored(url):
-            return None
-        prepare_directories(self.dirs)
-        try:
-            response = self.session.get(url, cookies=self.cookies, headers=self.headers)
-        except requests.exceptions.ConnectionError:
-            return False
-        with open(filepath, "wb") as f:
-            f.write(response.content)
-        time.sleep(1)
-        return True
-
-    def checkInterstitial(self, contents):
-        interstitial = re.findall('interstitial=([^"]+)', contents)
-        if interstitial:
-            return interstitial[0]
-        return False
-
-    def get_content(self, url):
-        # TODO: This needs refactor!
-        filepath = self.get_filepath(url)
-        for x in range(0, 3):
-            if not os.path.isfile(filepath) and not os.path.isfile(filepath + ".gz"):
-                contents = self.file_download(url, filepath)
-            else:
-                contents = self.file_read(filepath)
-            if contents is not None:
-                break
-            self.interface.print(f"...repeat request: {url}")
-            time.sleep(3)
-        if contents:
-            interstitial = self.checkInterstitial(contents)
-            if interstitial:
-                interstitial_url = (
-                    "http://" + self.crawler_url + "?interstitial=" + interstitial
-                )
-                self.file_download(
-                    interstitial_url, self.get_filepath(interstitial_url)
-                )
-                contents = self.file_download(
-                    "http://" + self.crawler_url,
-                    self.get_filepath("http://" + self.crawler_url),
-                )
-        return contents
-
-    def _fix_image_url(self, img: str) -> str:
-        if not img.startswith("http"):
-            uri = urlparse(self.url_to_crawl)
-            if uri.netloc not in img:
-                img = os.path.join(uri.netloc, img)
-            while not img.startswith("//"):
-                img = "/" + img
-            img = f"{uri.scheme}:{img}"
-        return img
-
-    def download_image(self, img: str) -> Optional[str]:
-        img = self._fix_image_url(img)
-        img_hash = self.get_urlhash(img)
-        img_type = os.path.splitext(img)[1].lower()
-        if img_type not in [".jpeg", ".jpg", ".png", ".bmp", ".gif", ".webp"]:
-            return None
-        original_fn = os.path.join(self.dirs.originals, img_hash + "." + img_type)
-        resized_fn = os.path.join(self.dirs.images, img_hash + ".jpg")
-        if os.path.isfile(resized_fn):
-            return img_hash + ".jpg"
-        if not os.path.isfile(resized_fn):
-            self.image_download(img, original_fn)
-        if os.path.isfile(original_fn):
-            original_img_type = imghdr.what(original_fn)
-            if original_img_type is None:
-                os.remove(original_fn)
-                return None
-            picture = Image.open(original_fn)
-            if (
-                picture.size[0] > self.images_size[0]
-                or picture.size[1] > self.images_size[1]
-            ):
-                picture.thumbnail(self.images_size, Image.LANCZOS)  # type: ignore
-            converted_picture = picture.convert("L")
-            converted_picture.save(
-                resized_fn, format="JPEG", quality=self.images_quality
-            )
-            os.remove(original_fn)
-            return img_hash + ".jpg"
-        return None

29 changes: 15 additions & 14 deletions blog2epub/crawlers/blogspot.py

@@ -6,17 +6,18 @@
 class BlogspotCrawler(DefaultCrawler):
     """Blogspot.com crawler."""
 
-    content_xpath = (
-        "//div[contains(concat(' ',normalize-space(@class),' '),'post-body')]"
-    )
-    images_regex = r'<table[^>]*><tbody>[\s]*<tr><td[^>]*><a href="([^"]*)"[^>]*><img[^>]*></a></td></tr>[\s]*<tr><td class="tr-caption" style="[^"]*">([^<]*)'
-    articles_regex = r"<h3 class=\'post-title entry-title\' itemprop=\'name\'>[\s]*<a href=\'([^\']*)\'>([^>^<]*)</a>[\s]*</h3>"
-
-    ignore_downloads = [
-        r"https:\/\/zblogowani\.pl\/[a-z]+\/[0-9]+x[0-9]+\/[a-z]+\/[0-9]+\/btn\.png",
-        r"https:\/\/www.blogger.com\/img\/blogger_logo_[a-z]+_[0-9]+\.png",
-        r"https:\/\/resources.blogblog.com\/img\/[a-z0-9_]+.gif",
-        r"https:\/\/www.paypalobjects.com\/[a-zA-Z_]+\/i\/scr\/pixel.gif",
-        r"https:\/\/resources.blogblog.com\/img\/widgets\/[a-zA-Z0-9\-\.]+",
-        r"https:\/\/[a-zA-Z0-9\.\/\-\_]+icon[0-9]+_[a-z]+.gif",
-    ]
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.content_xpath = (
+            "//div[contains(concat(' ',normalize-space(@class),' '),'post-body')]"
+        )
+        self.images_regex = r'<table[^>]*><tbody>[\s]*<tr><td[^>]*><a href="([^"]*)"[^>]*><img[^>]*></a></td></tr>[\s]*<tr><td class="tr-caption" style="[^"]*">([^<]*)'
+        self.articles_regex = r"<h3 class=\'post-title entry-title\' itemprop=\'name\'>[\s]*<a href=\'([^\']*)\'>([^>^<]*)</a>[\s]*</h3>"
+        self.ignore_downloads = [
+            r"https:\/\/zblogowani\.pl\/[a-z]+\/[0-9]+x[0-9]+\/[a-z]+\/[0-9]+\/btn\.png",
+            r"https:\/\/www.blogger.com\/img\/blogger_logo_[a-z]+_[0-9]+\.png",
+            r"https:\/\/resources.blogblog.com\/img\/[a-z0-9_]+.gif",
+            r"https:\/\/www.paypalobjects.com\/[a-zA-Z_]+\/i\/scr\/pixel.gif",
+            r"https:\/\/resources.blogblog.com\/img\/widgets\/[a-zA-Z0-9\-\.]+",
+            r"https:\/\/[a-zA-Z0-9\.\/\-\_]+icon[0-9]+_[a-z]+.gif",
+        ]
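
Moving the selectors from class attributes to instance attributes set after super().__init__() means every crawler configures the shared Downloader-backed machinery the same way. A hypothetical subclass sketch showing the pattern; ExampleCrawler, its xpath, and its regex are illustrative only:

    from blog2epub.crawlers.default import DefaultCrawler  # assumed import path


    class ExampleCrawler(DefaultCrawler):
        """Crawler for a hypothetical example.com blog engine."""

        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            self.content_xpath = "//article[contains(@class, 'entry-content')]"
            self.articles_regex = r'<h2 class="entry-title"><a href="([^"]*)">([^<]*)</a></h2>'
            self.ignore_downloads = [r"https:\/\/example\.com\/ads\/.*"]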