diff --git a/blog2epub/blog2epub_gui.py b/blog2epub/blog2epub_gui.py
index 546f040..d8db389 100644
--- a/blog2epub/blog2epub_gui.py
+++ b/blog2epub/blog2epub_gui.py
@@ -1,5 +1,6 @@
 import logging
 import os
+import platform
 import re
 import sys
 import webbrowser
@@ -17,7 +18,6 @@ from kivymd.uix.datatables import MDDataTable  # type: ignore
 from kivymd.uix.tab import MDTabsBase, MDTabs  # type: ignore
 from kivymd.uix.textfield import MDTextField  # type: ignore
 
-from kivy.utils import platform  # type: ignore
 
 from plyer import filechooser, notification, email  # type: ignore
 
@@ -65,7 +65,6 @@ def get_previous():
 
 now = datetime.now()
 date_time = now.strftime("%Y-%m-%d[%H.%M.%S]")
-
 logging_filename = os.path.join(
     str(Path.home()), ".blog2epub", f"blog2epub_{date_time}.log"
 )
@@ -406,28 +405,6 @@ def _is_int(value) -> Optional[int]:
         except ValueError:
             return None
 
-    def _get_params(self):
-        if platform == "android":
-            cache_folder = self.user_data_dir
-        else:
-            cache_folder = os.path.join(str(Path.home()), ".blog2epub")
-        destination_folder = str(Path.home())
-        return {
-            "interface": self.interface,
-            "url": self._get_url(),
-            "include_images": True,
-            "images_size": (600, 800),
-            "images_quality": 40,
-            "start": None,
-            "end": None,
-            "limit": self._is_int(self.limit_entry.text),
-            "skip": self._is_int(self.skip_entry.text),
-            "force_download": False,
-            "file_name": None,
-            "cache_folder": cache_folder,
-            "destination_folder": destination_folder,
-        }
-
     def _download_ebook(self, blog2epub: Blog2Epub):
         blog2epub.download()
         self._enable_download_button()
@@ -504,7 +481,23 @@ def download(self, instance):
         self.articles_table.update_row_data(self.articles_table, [])
         self.tab_select.disabled = True
         self.save_settings()
-        self.blog2epub = Blog2Epub(self._get_params())
+        self.blog2epub = Blog2Epub(
+            **{
+                "interface": self.interface,
+                "url": self._get_url(),
+                "include_images": True,
+                "images_size": (600, 800),
+                "images_quality": 40,
+                "start": None,
+                "end": None,
+                "limit": self._is_int(self.limit_entry.text),
+                "skip": self._is_int(self.skip_entry.text),
+                "force_download": False,
+                "file_name": None,
+                "cache_folder": os.path.join(str(Path.home()), ".blog2epub"),
+                "destination_folder": str(Path.home()),
+            }
+        )
         self.download_thread = Thread(
             target=self._download_ebook,
             kwargs={"blog2epub": self.blog2epub},
@@ -578,7 +571,6 @@ def send_ebook_via_email(inst):
             orientation="horizontal",
             size_hint=(1, 0.1),
             spacing=sp(10),
-            # padding=sp(16),
         )
 
         buttons_row.add_widget(
@@ -638,12 +630,12 @@ def __init__(self):
         self.title = f"blog2epub - v. {Blog2Epub.version}"
         logging.info(self.title)
         logging.debug(f"Metrics.density = {Metrics.density}")
-        if platform == "linux":
-            self.icon = asset_path("blog2epub.svg")
-        elif platform == "win":
+        if platform.system() == "Darwin":
+            self.icon = asset_path("blog2epub.icns")
+        elif platform.system() == "Windows":
             self.icon = asset_path("blog2epub_256px.png")
         else:
-            self.icon = asset_path("blog2epub.icns")
+            self.icon = asset_path("blog2epub.svg")
 
     def build(self):
         self.theme_cls.theme_style = "Light"
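Note on the icon-selection hunk above: kivy.utils.platform reports lowercase tokens such as "linux", "win", "macosx" and "android", while the stdlib platform.system() returns "Linux", "Windows" and "Darwin". A minimal sketch of the mapping the new branches rely on (asset file names are taken from the hunk, the helper name is hypothetical):

    import platform

    def icon_for_system(system: str) -> str:
        # platform.system() -> "Darwin" on macOS, "Windows", "Linux", or "" if unknown
        if system == "Darwin":
            return "blog2epub.icns"
        if system == "Windows":
            return "blog2epub_256px.png"
        return "blog2epub.svg"  # Linux and anything unrecognized

    print(icon_for_system(platform.system()))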
{Blog2Epub.version}" logging.info(self.title) logging.debug(f"Metrics.density = {Metrics.density}") - if platform == "linux": - self.icon = asset_path("blog2epub.svg") - elif platform == "win": + if platform.system() == "Darwin": + self.icon = asset_path("blog2epub.icns") + elif platform.system() == "Windows": self.icon = asset_path("blog2epub_256px.png") else: - self.icon = asset_path("blog2epub.icns") + self.icon = asset_path("blog2epub.svg") def build(self): self.theme_cls.theme_style = "Light" diff --git a/blog2epub/blog2epub_main.py b/blog2epub/blog2epub_main.py index a1e274a..87004a1 100755 --- a/blog2epub/blog2epub_main.py +++ b/blog2epub/blog2epub_main.py @@ -1,6 +1,3 @@ -from typing import Dict - -from blog2epub.common.exceptions import NoCrawlerDetectedError from blog2epub.common.globals import VERSION from blog2epub.crawlers import BlogspotCrawler, WordpressCrawler @@ -10,16 +7,10 @@ class Blog2Epub: version = VERSION - def __init__(self, params: Dict): - self.crawler = self.select_crawler(params) - if self.crawler is None: - raise NoCrawlerDetectedError("No crawler detected") - - @staticmethod - def select_crawler(params: Dict): - if params["url"].find(".blogspot.") > -1: - return BlogspotCrawler(**params) - return WordpressCrawler(**params) + def __init__(self, **kwargs): + if kwargs.get("url").find(".blogspot.") > -1: + self.crawler = BlogspotCrawler(**kwargs) + self.crawler = WordpressCrawler(**kwargs) def download(self): self.crawler.crawl() diff --git a/blog2epub/common/book.py b/blog2epub/common/book.py index a1bbd1f..85ae47a 100755 --- a/blog2epub/common/book.py +++ b/blog2epub/common/book.py @@ -76,8 +76,8 @@ def __init__( self, book_data: Any, configuration: ConfigurationModel, - interface: EmptyInterface, - destination_folder: str, + interface: EmptyInterface = EmptyInterface(), + destination_folder: str = ".", ): self.title = book_data.title self.description = book_data.description diff --git a/blog2epub/common/interfaces.py b/blog2epub/common/interfaces.py index d8f8bea..117da9e 100644 --- a/blog2epub/common/interfaces.py +++ b/blog2epub/common/interfaces.py @@ -1,13 +1,8 @@ -from abc import ABC, abstractmethod - - -class EmptyInterface(ABC): +class EmptyInterface: """Empty interface for script output.""" - @abstractmethod def print(self, text: str): - pass + print(text) - @abstractmethod def exception(self, **kwargs): - pass + print(kwargs) diff --git a/blog2epub/crawlers/abstract.py b/blog2epub/crawlers/abstract.py index 1206542..ab52f01 100644 --- a/blog2epub/crawlers/abstract.py +++ b/blog2epub/crawlers/abstract.py @@ -1,12 +1,61 @@ +import os from abc import ABC, abstractmethod -from typing import List, Optional +from datetime import datetime +from pathlib import Path +from typing import List, Optional, Dict -from blog2epub.models.book import ArticleModel +from blog2epub.common.book import Book +from blog2epub.common.interfaces import EmptyInterface +from blog2epub.crawlers.default import Article, Downloader +from blog2epub.models.book import ArticleModel, DirModel +from blog2epub.models.configuration import ConfigurationModel +from blog2epub.common.crawler import ( + prepare_file_name, + prepare_port, + prepare_url_to_crawl, +) class AbstractCrawler(ABC): ignore_downloads: List[str] = [] + def __init__( + self, + url: str, + configuration: ConfigurationModel, + start: Optional[datetime] = None, + end: Optional[datetime] = None, + file_name: Optional[str] = None, + cache_folder: str = os.path.join(str(Path.home()), ".blog2epub"), + interface: EmptyInterface = 
diff --git a/blog2epub/common/book.py b/blog2epub/common/book.py
index a1bbd1f..85ae47a 100755
--- a/blog2epub/common/book.py
+++ b/blog2epub/common/book.py
@@ -76,8 +76,8 @@ def __init__(
         self,
         book_data: Any,
         configuration: ConfigurationModel,
-        interface: EmptyInterface,
-        destination_folder: str,
+        interface: EmptyInterface = EmptyInterface(),
+        destination_folder: str = ".",
     ):
         self.title = book_data.title
         self.description = book_data.description
diff --git a/blog2epub/common/interfaces.py b/blog2epub/common/interfaces.py
index d8f8bea..117da9e 100644
--- a/blog2epub/common/interfaces.py
+++ b/blog2epub/common/interfaces.py
@@ -1,13 +1,8 @@
-from abc import ABC, abstractmethod
-
-
-class EmptyInterface(ABC):
+class EmptyInterface:
     """Empty interface for script output."""
 
-    @abstractmethod
     def print(self, text: str):
-        pass
+        print(text)
 
-    @abstractmethod
     def exception(self, **kwargs):
-        pass
+        print(kwargs)
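Because EmptyInterface is no longer abstract, it doubles as a usable print-to-stdout fallback, which is what makes the new default arguments in Book.__init__ and the crawler constructor possible. Callers that want different output override only what they need; a hypothetical logging-backed variant as a sketch:

    import logging

    from blog2epub.common.interfaces import EmptyInterface

    class LoggingInterface(EmptyInterface):
        # Hypothetical subclass: routes crawler messages to the logging module.
        def print(self, text: str):
            logging.info(text)

        def exception(self, **kwargs):
            logging.error("crawler exception: %s", kwargs)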
diff --git a/blog2epub/crawlers/abstract.py b/blog2epub/crawlers/abstract.py
index 1206542..ab52f01 100644
--- a/blog2epub/crawlers/abstract.py
+++ b/blog2epub/crawlers/abstract.py
@@ -1,12 +1,68 @@
+import os
 from abc import ABC, abstractmethod
-from typing import List, Optional
+from datetime import datetime
+from pathlib import Path
+from typing import TYPE_CHECKING, Dict, List, Optional
 
-from blog2epub.models.book import ArticleModel
+from blog2epub.common.book import Book
+from blog2epub.common.interfaces import EmptyInterface
+from blog2epub.models.book import ArticleModel, DirModel
+from blog2epub.models.configuration import ConfigurationModel
+from blog2epub.common.crawler import (
+    prepare_file_name,
+    prepare_port,
+    prepare_url_to_crawl,
+)
+
+if TYPE_CHECKING:
+    # Type-checking-only import: a module-level runtime import would be
+    # circular, because crawlers/default.py imports AbstractCrawler from here.
+    from blog2epub.crawlers.default import Article, Downloader
 
 
 class AbstractCrawler(ABC):
     ignore_downloads: List[str] = []
 
+    def __init__(
+        self,
+        url: str,
+        configuration: ConfigurationModel,
+        start: Optional[datetime] = None,
+        end: Optional[datetime] = None,
+        file_name: Optional[str] = None,
+        cache_folder: str = os.path.join(str(Path.home()), ".blog2epub"),
+        interface: EmptyInterface = EmptyInterface(),
+    ):
+        # Deferred import, for the same circular-import reason as above.
+        from blog2epub.crawlers.default import Downloader
+
+        super().__init__()
+        self.url = url
+        self.configuration = configuration
+        self.url_to_crawl = prepare_url_to_crawl(self.url)
+        self.port = prepare_port(self.url_to_crawl)
+        self.file_name = prepare_file_name(file_name, self.url)
+        self.cache_folder = cache_folder
+        self.start = start
+        self.end = end
+        self.interface = interface
+        self.dirs = DirModel(
+            path=str(os.path.join(self.cache_folder, self.url.replace("/", "_"))),
+        )
+        self.book: Optional[Book]
+        self.title = None
+        self.subtitle = None
+        self.description = None
+        self.language: str | None = self.configuration.language
+        self.atom_feed = False
+        self.articles: List["Article"] = []
+        self.article_counter = 0
+        self.images: List[str] = []
+        self.downloader = Downloader(self)
+        self.tags: Dict = {}
+        self.active = False
+        self.cancelled = False
+
     @abstractmethod
     def crawl(self):
         pass
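Two details of the constructor above are easy to miss: both defaults (the cache_folder path and the shared EmptyInterface() instance) are evaluated once at definition time, which is only safe while EmptyInterface stays stateless, and the Downloader import is deferred into __init__ to break the import cycle with crawlers/default.py. A minimal sketch of a concrete subclass, assuming crawl() is the only abstract method (as in the part of the class shown here) and using a hypothetical class name:

    from blog2epub.crawlers.abstract import AbstractCrawler

    class MinimalCrawler(AbstractCrawler):
        # Hypothetical example: inherits the shared constructor unchanged.
        def crawl(self):
            self.interface.print(f"crawling {self.url_to_crawl}")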
" + art.title) @@ -308,7 +237,7 @@ def _articles_loop(self, content): break def _check_limit(self): - if self.limit and len(self.articles) >= self.limit: + if self.configuration.limit and len(self.articles) >= self.configuration.limit: self.url_to_crawl = None def _prepare_content(self, content): @@ -325,7 +254,7 @@ def crawl(self): self.title = self._get_blog_title(content) content = self._prepare_content(content) self._articles_loop(content) - if not self.skip and len(self.articles) == 0: + if not self.configuration.skip and len(self.articles) == 0: self._get_atom_content() self._atom_feed_loop() self.url_to_crawl = self._get_url_to_crawl(tree) @@ -366,7 +295,6 @@ def __init__(self, crawler): self.crawler_url = crawler.url self.crawler_port = crawler.port self.interface = crawler.interface - self.force_download = crawler.force_download self.images_size = crawler.images_size self.images_quality = crawler.images_quality self.ignore_downloads = crawler.ignore_downloads @@ -441,9 +369,7 @@ def get_content(self, url): # TODO: This needs refactor! filepath = self.get_filepath(url) for x in range(0, 3): - if self.force_download or ( - not os.path.isfile(filepath) and not os.path.isfile(filepath + ".gz") - ): + if not os.path.isfile(filepath) and not os.path.isfile(filepath + ".gz"): contents = self.file_download(url, filepath) else: contents = self.file_read(filepath) @@ -451,7 +377,7 @@ def get_content(self, url): break self.interface.print(f"...repeat request: {url}") time.sleep(3) - if contents is not None: + if contents: interstitial = self.checkInterstitial(contents) if interstitial: interstitial_url = ( @@ -486,7 +412,7 @@ def download_image(self, img: str) -> Optional[str]: resized_fn = os.path.join(self.dirs.images, img_hash + ".jpg") if os.path.isfile(resized_fn): return img_hash + ".jpg" - if not os.path.isfile(resized_fn) or self.force_download: + if not os.path.isfile(resized_fn): self.image_download(img, original_fn) if os.path.isfile(original_fn): original_img_type = imghdr.what(original_fn)