Commit

strip_tags module and first working integration test
bohdanbobrowski committed Nov 5, 2024
1 parent 160fa0f commit 23336c2
Showing 7 changed files with 122 additions and 126 deletions.
96 changes: 30 additions & 66 deletions blog2epub/blog2epub_cli.py
@@ -1,80 +1,44 @@
-import sys
-from urllib import parse
+import argparse
 
 from blog2epub import Blog2Epub
 from blog2epub.common.book import Book
 from blog2epub.common.exceptions import BadUrlException, NotEnoughCommandsException
 from blog2epub.common.interfaces import EmptyInterface
 from blog2epub.models.configuration import ConfigurationModel
 
 
 class CliInterface(EmptyInterface):
-    @staticmethod
-    def print(text: str):
+    def print(self, text: str):
         print(text)
 
-    @staticmethod
-    def exception(e):
+    def exception(self, e):
         print(e)
 
 
-class Blog2EpubCli:
-    """Command line interface for Blog2Epub class."""
-
-    def __init__(self, defaults={}):
-        params = {**defaults, **self.parse_parameters()}
-        blog2epub = Blog2Epub(params)
-        blog2epub.download()
-        book_data = blog2epub.crawler.get_book_data()
-        ebook = Book(
-            book_data=book_data,
-            configuration=ConfigurationModel(
-                language=blog2epub.crawler.language,
-            ),
-            interface=params["interface"],
-            destination_folder=str("."),
-        )
-        ebook.save(book_data.articles)
-
-    @staticmethod
-    def get_url():
-        if len(sys.argv) > 1:
-            if parse.urlparse(sys.argv[1]):
-                return sys.argv[1]
-            raise BadUrlException("Blog url is not valid.")
-        raise NotEnoughCommandsException("Not enough command line parameters.")
-
-    def parse_parameters(self):
-        params = {"interface": CliInterface()}
-        try:
-            params["url"] = self.get_url()
-        except (BadUrlException, NotEnoughCommandsException) as e:
-            print(e)
-            print("usage: blog2epub <blog_name> [params...]")
-            exit()
-
-        params["url"] = sys.argv[1]
-
-        if "-n" in sys.argv or "--no-images" in sys.argv:
-            params["include_images"] = False
-        for arg in sys.argv:
-            if arg.find("-l=") == 0:
-                params["limit"] = int(arg.replace("-l=", ""))
-            if arg.find("--limit=") == 0:
-                params["limit"] = int(arg.replace("--limit=", ""))
-            if arg.find("-s=") == 0:
-                params["skip"] = int(arg.replace("-s=", ""))
-            if arg.find("--skip=") == 0:
-                params["skip"] = int(arg.replace("--skip=", ""))
-            if arg.find("-q=") == 0:
-                params["images_quality"] = int(arg.replace("-q=", ""))
-            if arg.find("--quality=") == 0:
-                params["images_quality"] = int(arg.replace("--quality=", ""))
-        return params
-
-
 def main():
-    Blog2EpubCli()
+    parser = argparse.ArgumentParser(
+        prog="Blog2epub Cli interface",
+        description="Convert blog (blogspot.com, wordpress.com or another based on Wordpress) to epub using CLI or GUI.",
+    )
+    parser.add_argument("url", help="url of blog to download")
+    parser.add_argument("-l", "--limit", type=int, default=None, help="articles limit")
+    parser.add_argument(
+        "-s", "--skipped", type=int, default=None, help="number of skipped articles"
+    )
+    parser.add_argument("-o", "--output", help="output epub file name")
+    parser.add_argument("-d", "--debug", action="store_true", help="turn on debug")
+    args = parser.parse_args()
+
+    print(args)
+
+    # blog2epub = Blog2Epub()
+    # blog2epub.download()
+    # book_data = blog2epub.crawler.get_book_data()
+    # ebook = Book(
+    #     book_data=book_data,
+    #     configuration=ConfigurationModel(
+    #         language=blog2epub.crawler.language,
+    #     ),
+    #     interface=params["interface"],
+    #     destination_folder=str("."),
+    # )
+    # ebook.save(book_data.articles)
 
 
 if __name__ == "__main__":
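A note on the new entry point: for now main() only prints the parsed namespace, and the download pipeline stays commented out. A minimal sketch of how the parsed arguments might eventually be wired in, reusing the Blog2Epub(url=..., configuration=...) call shape from the integration test in this commit; the limit conversion and field names here are assumptions, not part of this change:

import argparse

from blog2epub import Blog2Epub
from blog2epub.models.configuration import ConfigurationModel


def main():
    parser = argparse.ArgumentParser(prog="blog2epub")
    parser.add_argument("url", help="url of blog to download")
    parser.add_argument("-l", "--limit", type=int, default=None, help="articles limit")
    args = parser.parse_args()
    # Assumed wiring: ConfigurationModel stores limit as a string, as the
    # updated test fixture (limit="1") suggests.
    configuration = ConfigurationModel(limit=str(args.limit) if args.limit else None)
    blog2epub = Blog2Epub(url=args.url, configuration=configuration)
    blog2epub.download()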
48 changes: 23 additions & 25 deletions blog2epub/crawlers/abstract.py
@@ -7,6 +7,7 @@
 
 from lxml.html.soupparser import fromstring
 from lxml.etree import tostring
+from strip_tags import strip_tags
 
 from blog2epub.common.downloader import Downloader
 import dateutil
@@ -67,9 +68,7 @@ def __init__(
         self.cancelled = False
         self.ignore_downloads: List[str] = []
         self.article_class = Article
-        self.content_xpath = (
-            "//div[contains(concat(' ',normalize-space(@class),' '),'post-body')]"
-        )
+        self.content_xpath = '//div[contains(@itemprop, "articleBody")]'
         self.images_regex = r'<table[^>]*><tbody>[\s]*<tr><td[^>]*><a href="([^"]*)"[^>]*><img[^>]*></a></td></tr>[\s]*<tr><td class="tr-caption" style="[^"]*">([^<]*)'
         self.articles_regex = r"<h3 class=\'post-title entry-title\' itemprop=\'name\'>[\s]*<a href=\'([^\']*)\'>([^>^<]*)</a>[\s]*</h3>"
         self.downloader = Downloader(
@@ -125,8 +124,12 @@ def __init__(self, url, html, crawler: AbstractCrawler):
             ignore_downloads=crawler.ignore_downloads,
         )
 
-    def get_title(self):
-        self.title = html.unescape(self.title.strip())
+    def get_title(self) -> str:
+        title = self.tree.xpath('//meta[@property="og:title"]/@content')
+        if not title:
+            title = self.tree.xpath('//*[@class="post-title entry-title"]/text()')
+        title = title[0]
+        return html.unescape(title.strip())
 
     def get_date(self):
         if isinstance(self.date, datetime):
@@ -200,7 +203,7 @@ def process_images(self, images, ripper):
             self.html = ripper(img=img, img_hash=img_hash, art_html=self.html)
             self.images.append(img_hash)
             self.images_captions.append(caption)
-        self.get_tree()
+        self.tree = fromstring(self.html)
 
     def get_images(self):
         self.process_images(self._find_images(), self._default_ripper)
@@ -213,12 +216,12 @@ def get_images(self):
         )
         self.process_images(self.tree.xpath("//img/@src"), self._img_ripper)
         self.replace_images()
-        self.get_tree()
+        self.tree = fromstring(self.html)
 
     def set_content(self, content):
         self.content = content
         self.html = content
-        self.get_tree()
+        self.tree = fromstring(self.html)
 
     def replace_images(self):
         for key, image in enumerate(self.images):
@@ -233,20 +236,15 @@ def replace_images(self):
             self.html = self.html.replace("#blog2epubimage#" + image + "#", image_html)
 
     def get_content(self):
-        self.content = self.tree.xpath(self.content_xpath)
-        if len(self.content) == 1:
-            self.content = tostring(self.content[0]).decode("utf-8")
-            self.content = re.sub('style="[^"]*"', "", self.content)
-            self.content = re.sub('class="[^"]*"', "", self.content)
-            for src in re.findall('<iframe.+? src="([^?= ]*)', self.content):
-                self.content = re.sub(
-                    f"<iframe.+?{src}.+?/>",
-                    f'<a href="{src}">{src}</a>',
-                    self.content,
-                )
-
-    def get_tree(self):
-        self.tree = fromstring(self.html)
+        content_element = self.tree.xpath(self.content_xpath)
+        content_html = tostring(content_element[0])
+        content = strip_tags(
+            content_html,
+            ["div"],
+            minify=True,
+            keep_tags=["a", "img", "p", "i", "b", "strong"],
+        )
+        return content
 
     def get_tags(self):
         tags = self.tree.xpath('//a[@rel="tag"]//text()')
@@ -295,10 +293,10 @@ def get_comments(self):
         pass
 
     def process(self):
-        self.get_tree()
-        self.get_title()
+        self.tree = fromstring(self.html)
+        self.title = self.get_title()
         self.get_date()
         self.get_images()
-        self.get_content()
+        self.content = self.get_content()
         self.get_tags()
         self.get_comments()
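
Two of these changes are worth unpacking. First, the rewritten get_title() prefers the og:title meta tag and falls back to the post-title heading. A self-contained sketch of that lookup on invented sample markup:

import html

from lxml.html.soupparser import fromstring

# Sample page invented for illustration; real posts carry much more markup.
page = '<html><head><meta property="og:title" content="My &amp; Post"/></head><body/></html>'
tree = fromstring(page)

title = tree.xpath('//meta[@property="og:title"]/@content')
if not title:
    # Fallback for templates without Open Graph tags.
    title = tree.xpath('//*[@class="post-title entry-title"]/text()')
print(html.unescape(title[0].strip()))  # -> My & Post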
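
Second, get_content() swaps the old regex-based style/class stripping for the strip-tags library (the new dependency below). A standalone demonstration of the same call pattern; the sample HTML and the comments about its output are illustrative assumptions, not taken from this commit:

from lxml.etree import tostring
from lxml.html.soupparser import fromstring
from strip_tags import strip_tags

html_doc = """
<html><body>
  <div itemprop="articleBody" class="post-body" style="color: red">
    <p>First <b>paragraph</b> with a <a href="https://example.com">link</a>.</p>
    <script>console.log("dropped");</script>
  </div>
</body></html>
"""

tree = fromstring(html_doc)
content_element = tree.xpath('//div[contains(@itemprop, "articleBody")]')
content = strip_tags(
    tostring(content_element[0]).decode("utf-8"),  # decoded here for clarity
    ["div"],
    minify=True,
    keep_tags=["a", "img", "p", "i", "b", "strong"],
)
# Whitelisted inline tags survive; style/class attributes and script bodies
# should be stripped away.
print(content)

The ["div"] argument is a CSS selector list restricting output to matching elements, and minify=True collapses runs of whitespace, which replaces the three re.sub() passes of the old implementation.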
4 changes: 1 addition & 3 deletions blog2epub/crawlers/blogspot.py
@@ -8,9 +8,7 @@ class BlogspotCrawler(DefaultCrawler):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-        self.content_xpath = (
-            "//div[contains(concat(' ',normalize-space(@class),' '),'post-body')]"
-        )
+        self.content_xpath = '//div[contains(@itemprop, "articleBody")]'
         self.images_regex = r'<table[^>]*><tbody>[\s]*<tr><td[^>]*><a href="([^"]*)"[^>]*><img[^>]*></a></td></tr>[\s]*<tr><td class="tr-caption" style="[^"]*">([^<]*)'
         self.articles_regex = r"<h3 class=\'post-title entry-title\' itemprop=\'name\'>[\s]*<a href=\'([^\']*)\'>([^>^<]*)</a>[\s]*</h3>"
         self.ignore_downloads = [
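The selector change, here and in AbstractCrawler above, moves from matching the post-body class to the schema.org itemprop attribute. A quick illustration on invented markup; the assumption that some Blogger templates carry only the itemprop marker is inferred from this change, not stated in the commit:

from lxml.html.soupparser import fromstring

old_xpath = "//div[contains(concat(' ',normalize-space(@class),' '),'post-body')]"
new_xpath = '//div[contains(@itemprop, "articleBody")]'

# A theme that tags the post body only with itemprop="articleBody".
tree = fromstring('<div itemprop="articleBody"><p>text</p></div>')

print(len(tree.xpath(old_xpath)))  # 0: no post-body class to match
print(len(tree.xpath(new_xpath)))  # 1: itemprop still identifies the content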
33 changes: 9 additions & 24 deletions blog2epub/crawlers/default.py
@@ -172,29 +172,6 @@ def _atom_feed_loop(self):
                 self.interface.print(str(e))
                 self.interface.print("[article not recognized - skipping]")
 
-    def _articles_loop(self, content):
-        pass
-        #
-        # self.article_counter += 1
-        # if not self.configuration.skip or (
-        #     self.configuration.skip.isdigit()
-        #     and self.article_counter > int(self.configuration.skip)
-        # ):
-        #     art.process()
-        #     self.images = self.images + art.images
-        #     art_no = str(len(self.articles) + 1)
-        #     self.interface.print(f"{art_no}. {art.title}")
-        #     if self.start:
-        #         self.end = art.date
-        #     else:
-        #         self.start = art.date
-        #     self.articles.append(art)
-        #     self._add_tags(art.tags)
-        # else:
-        #     self.interface.print("[skipping] " + art.title)
-        # if self._break_the_loop():
-        #     break
-
     def _break_the_loop(self):
         if (
             self.cancelled
@@ -237,7 +214,15 @@ def crawl(self):
                 self.images = self.images + self._get_header_images(tree)
                 self.description = self._get_blog_description(tree)
                 self.title = self._get_blog_title(content)
-                art = self.article_class(page_url, content, )
+                art = self.article_class(page_url, content, self)
+                art.process()
+                self.images = self.images + art.images
+                if self.start:
+                    self.end = art.date
+                else:
+                    self.start = art.date
+                self.articles.append(art)
+                self.interface.print(f"{len(self.articles)}. {art.title}")
                 if self._break_the_loop():
                     break
             else:
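The crawl() loop now processes each article inline instead of delegating to the deleted _articles_loop() stub: process the article, track the first and last article dates, and stop when _break_the_loop() fires. A simplified, self-contained sketch of that bookkeeping, where Article and the limit check are stand-ins for the real crawler types:

from dataclasses import dataclass
from datetime import datetime


@dataclass
class Article:
    title: str
    date: datetime


def crawl(found_articles, limit=None):
    articles, start, end = [], None, None
    for art in found_articles:
        if start:
            end = art.date  # every later article moves the end of the range
        else:
            start = art.date  # the first processed article opens the range
        articles.append(art)
        print(f"{len(articles)}. {art.title}")
        if limit and len(articles) >= limit:  # stand-in for _break_the_loop()
            break
    return articles, start, end

With limit set to 1, as in the integration test below, the loop stops after the first article.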
53 changes: 52 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -23,6 +23,7 @@ kivymd = "^1.2.0"
 pydantic-yaml = "^1.3.0"
 plyer = "^2.1.0"
 pyjnius = "^1.6.1"
+strip-tags = "^0.5.1"
 
 [tool.poetry.group.docs.dependencies]
 mkdocs = "^1.6.0"
13 changes: 6 additions & 7 deletions tests/integration/blog2epub/test_blog2epub_main.py
@@ -1,7 +1,6 @@
 import tempfile
 
 import pytest
-import unittest
 from blog2epub.blog2epub_main import Blog2Epub
 from blog2epub.models.configuration import ConfigurationModel
@@ -10,19 +9,19 @@
 def mock_configuration() -> ConfigurationModel:
     return ConfigurationModel(
         destination_folder=tempfile.gettempdir(),
-        limit=1,
+        limit="1",
     )
 
 
-class TestBlog2EPubMain(unittest.TestCase):
-    def test_download(self, mock_configuration):
+class TestBlog2EPubMain:
+    def test_starybezpiek_downloads_one_article(self, mock_configuration):
         # given
         given_blog2epub = Blog2Epub(
-            url="https://starybezpiek.blogspot.com",
+            url="starybezpiek.blogspot.com",
             configuration=mock_configuration,
-
         )
         # when
         given_blog2epub.download()
         # then
-        assert True
+        assert len(given_blog2epub.crawler.articles) == 1
+        assert len(given_blog2epub.crawler.images) > 1
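
The test class drops unittest.TestCase so that mock_configuration can be injected as a method parameter; pytest does not inject fixtures into TestCase subclasses. The decorator sits just above the visible hunk, but presumably follows the minimal pattern below (a self-contained sketch, not code from this commit):

import pytest


@pytest.fixture
def mock_configuration() -> dict:
    # Stand-in for the real ConfigurationModel used by the integration test.
    return {"destination_folder": "/tmp", "limit": "1"}


class TestExample:  # plain class, not unittest.TestCase
    def test_uses_fixture(self, mock_configuration):
        assert mock_configuration["limit"] == "1"

Run it with pytest tests/integration/blog2epub/test_blog2epub_main.py; note that the real test hits the network and downloads one article from starybezpiek.blogspot.com.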
