Commit

strip_tags module and first working integration test
bohdanbobrowski committed Nov 5, 2024
1 parent 160fa0f commit 23336c2
Showing 7 changed files with 122 additions and 126 deletions.
96 changes: 30 additions & 66 deletions blog2epub/blog2epub_cli.py
@@ -1,80 +1,44 @@
-import sys
-from urllib import parse
+import argparse
 
 from blog2epub import Blog2Epub
 from blog2epub.common.book import Book
 from blog2epub.common.exceptions import BadUrlException, NotEnoughCommandsException
 from blog2epub.common.interfaces import EmptyInterface
 from blog2epub.models.configuration import ConfigurationModel
 
 
 class CliInterface(EmptyInterface):
-    @staticmethod
-    def print(text: str):
+    def print(self, text: str):
         print(text)
 
-    @staticmethod
-    def exception(e):
+    def exception(self, e):
         print(e)
 
 
-class Blog2EpubCli:
-    """Command line interface for Blog2Epub class."""
-
-    def __init__(self, defaults={}):
-        params = {**defaults, **self.parse_parameters()}
-        blog2epub = Blog2Epub(params)
-        blog2epub.download()
-        book_data = blog2epub.crawler.get_book_data()
-        ebook = Book(
-            book_data=book_data,
-            configuration=ConfigurationModel(
-                language=blog2epub.crawler.language,
-            ),
-            interface=params["interface"],
-            destination_folder=str("."),
-        )
-        ebook.save(book_data.articles)
-
-    @staticmethod
-    def get_url():
-        if len(sys.argv) > 1:
-            if parse.urlparse(sys.argv[1]):
-                return sys.argv[1]
-            raise BadUrlException("Blog url is not valid.")
-        raise NotEnoughCommandsException("Not enough command line parameters.")
-
-    def parse_parameters(self):
-        params = {"interface": CliInterface()}
-        try:
-            params["url"] = self.get_url()
-        except (BadUrlException, NotEnoughCommandsException) as e:
-            print(e)
-            print("usage: blog2epub <blog_name> [params...]")
-            exit()
-
-        params["url"] = sys.argv[1]
-
-        if "-n" in sys.argv or "--no-images" in sys.argv:
-            params["include_images"] = False
-        for arg in sys.argv:
-            if arg.find("-l=") == 0:
-                params["limit"] = int(arg.replace("-l=", ""))
-            if arg.find("--limit=") == 0:
-                params["limit"] = int(arg.replace("--limit=", ""))
-            if arg.find("-s=") == 0:
-                params["skip"] = int(arg.replace("-s=", ""))
-            if arg.find("--skip=") == 0:
-                params["skip"] = int(arg.replace("--skip=", ""))
-            if arg.find("-q=") == 0:
-                params["images_quality"] = int(arg.replace("-q=", ""))
-            if arg.find("--quality=") == 0:
-                params["images_quality"] = int(arg.replace("--quality=", ""))
-        return params
-
-
 def main():
-    Blog2EpubCli()
+    parser = argparse.ArgumentParser(
+        prog="Blog2epub Cli interface",
+        description="Convert blog (blogspot.com, wordpress.com or another based on Wordpress) to epub using CLI or GUI.",
+    )
+    parser.add_argument("url", help="url of blog to download")
+    parser.add_argument("-l", "--limit", type=int, default=None, help="articles limit")
+    parser.add_argument(
+        "-s", "--skipped", type=int, default=None, help="number of skipped articles"
+    )
+    parser.add_argument("-o", "--output", help="output epub file name")
+    parser.add_argument("-d", "--debug", action="store_true", help="turn on debug")
+    args = parser.parse_args()
+
+    print(args)
+
+    # blog2epub = Blog2Epub()
+    # blog2epub.download()
+    # book_data = blog2epub.crawler.get_book_data()
+    # ebook = Book(
+    #     book_data=book_data,
+    #     configuration=ConfigurationModel(
+    #         language=blog2epub.crawler.language,
+    #     ),
+    #     interface=params["interface"],
+    #     destination_folder=str("."),
+    # )
+    # ebook.save(book_data.articles)
 
 
 if __name__ == "__main__":
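A note on the new entry point: for now main() only prints the parsed namespace, and the download pipeline stays commented out. A minimal sketch of how the parsed arguments might eventually be wired in, reusing the Blog2Epub(url=..., configuration=...) call shape from the integration test in this commit; the limit conversion and field names here are assumptions, not part of this change:

import argparse

from blog2epub import Blog2Epub
from blog2epub.models.configuration import ConfigurationModel


def main():
    parser = argparse.ArgumentParser(prog="blog2epub")
    parser.add_argument("url", help="url of blog to download")
    parser.add_argument("-l", "--limit", type=int, default=None, help="articles limit")
    args = parser.parse_args()
    # Assumed wiring: ConfigurationModel stores limit as a string, as the
    # updated test fixture (limit="1") suggests.
    configuration = ConfigurationModel(limit=str(args.limit) if args.limit else None)
    blog2epub = Blog2Epub(url=args.url, configuration=configuration)
    blog2epub.download()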
48 changes: 23 additions & 25 deletions blog2epub/crawlers/abstract.py
@@ -7,6 +7,7 @@
 
 from lxml.html.soupparser import fromstring
 from lxml.etree import tostring
+from strip_tags import strip_tags
 
 from blog2epub.common.downloader import Downloader
 import dateutil
@@ -67,9 +68,7 @@ def __init__(
         self.cancelled = False
         self.ignore_downloads: List[str] = []
         self.article_class = Article
-        self.content_xpath = (
-            "//div[contains(concat(' ',normalize-space(@class),' '),'post-body')]"
-        )
+        self.content_xpath = '//div[contains(@itemprop, "articleBody")]'
         self.images_regex = r'<table[^>]*><tbody>[\s]*<tr><td[^>]*><a href="([^"]*)"[^>]*><img[^>]*></a></td></tr>[\s]*<tr><td class="tr-caption" style="[^"]*">([^<]*)'
         self.articles_regex = r"<h3 class=\'post-title entry-title\' itemprop=\'name\'>[\s]*<a href=\'([^\']*)\'>([^>^<]*)</a>[\s]*</h3>"
         self.downloader = Downloader(
@@ -125,8 +124,12 @@ def __init__(self, url, html, crawler: AbstractCrawler):
             ignore_downloads=crawler.ignore_downloads,
         )
 
-    def get_title(self):
-        self.title = html.unescape(self.title.strip())
+    def get_title(self) -> str:
+        title = self.tree.xpath('//meta[@property="og:title"]/@content')
+        if not title:
+            title = self.tree.xpath('//*[@class="post-title entry-title"]/text()')
+        title = title[0]
+        return html.unescape(title.strip())
 
     def get_date(self):
         if isinstance(self.date, datetime):
@@ -200,7 +203,7 @@ def process_images(self, images, ripper):
             self.html = ripper(img=img, img_hash=img_hash, art_html=self.html)
             self.images.append(img_hash)
             self.images_captions.append(caption)
-        self.get_tree()
+        self.tree = fromstring(self.html)
 
     def get_images(self):
         self.process_images(self._find_images(), self._default_ripper)
@@ -213,12 +216,12 @@ def get_images(self):
         )
         self.process_images(self.tree.xpath("//img/@src"), self._img_ripper)
         self.replace_images()
-        self.get_tree()
+        self.tree = fromstring(self.html)
 
     def set_content(self, content):
         self.content = content
         self.html = content
-        self.get_tree()
+        self.tree = fromstring(self.html)
 
     def replace_images(self):
         for key, image in enumerate(self.images):
@@ -233,20 +236,15 @@ def replace_images(self):
             self.html = self.html.replace("#blog2epubimage#" + image + "#", image_html)
 
     def get_content(self):
-        self.content = self.tree.xpath(self.content_xpath)
-        if len(self.content) == 1:
-            self.content = tostring(self.content[0]).decode("utf-8")
-            self.content = re.sub('style="[^"]*"', "", self.content)
-            self.content = re.sub('class="[^"]*"', "", self.content)
-            for src in re.findall('<iframe.+? src="([^?= ]*)', self.content):
-                self.content = re.sub(
-                    f"<iframe.+?{src}.+?/>",
-                    f'<a href="{src}">{src}</a>',
-                    self.content,
-                )
-
-    def get_tree(self):
-        self.tree = fromstring(self.html)
+        content_element = self.tree.xpath(self.content_xpath)
+        content_html = tostring(content_element[0])
+        content = strip_tags(
+            content_html,
+            ["div"],
+            minify=True,
+            keep_tags=["a", "img", "p", "i", "b", "strong"],
+        )
+        return content
 
     def get_tags(self):
         tags = self.tree.xpath('//a[@rel="tag"]//text()')
@@ -295,10 +293,10 @@ def get_comments(self):
         pass
 
     def process(self):
-        self.get_tree()
-        self.get_title()
+        self.tree = fromstring(self.html)
+        self.title = self.get_title()
         self.get_date()
         self.get_images()
-        self.get_content()
+        self.content = self.get_content()
         self.get_tags()
         self.get_comments()
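
Two of these changes are worth unpacking. First, the rewritten get_title() prefers the og:title meta tag and falls back to the post-title heading. A self-contained sketch of that lookup on invented sample markup:

import html

from lxml.html.soupparser import fromstring

# Sample page invented for illustration; real posts carry much more markup.
page = '<html><head><meta property="og:title" content="My &amp; Post"/></head><body/></html>'
tree = fromstring(page)

title = tree.xpath('//meta[@property="og:title"]/@content')
if not title:
    # Fallback for templates without Open Graph tags.
    title = tree.xpath('//*[@class="post-title entry-title"]/text()')
print(html.unescape(title[0].strip()))  # -> My & Post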
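
Second, get_content() swaps the old regex-based style/class stripping for the strip-tags library (the new dependency below). A standalone demonstration of the same call pattern; the sample HTML and the comments about its output are illustrative assumptions, not taken from this commit:

from lxml.etree import tostring
from lxml.html.soupparser import fromstring
from strip_tags import strip_tags

html_doc = """
<html><body>
  <div itemprop="articleBody" class="post-body" style="color: red">
    <p>First <b>paragraph</b> with a <a href="https://example.com">link</a>.</p>
    <script>console.log("dropped");</script>
  </div>
</body></html>
"""

tree = fromstring(html_doc)
content_element = tree.xpath('//div[contains(@itemprop, "articleBody")]')
content = strip_tags(
    tostring(content_element[0]).decode("utf-8"),  # decoded here for clarity
    ["div"],
    minify=True,
    keep_tags=["a", "img", "p", "i", "b", "strong"],
)
# Whitelisted inline tags survive; style/class attributes and script bodies
# should be stripped away.
print(content)

The ["div"] argument is a CSS selector list restricting output to matching elements, and minify=True collapses runs of whitespace, which replaces the three re.sub() passes of the old implementation.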
4 changes: 1 addition & 3 deletions blog2epub/crawlers/blogspot.py
@@ -8,9 +8,7 @@ class BlogspotCrawler(DefaultCrawler):
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-        self.content_xpath = (
-            "//div[contains(concat(' ',normalize-space(@class),' '),'post-body')]"
-        )
+        self.content_xpath = '//div[contains(@itemprop, "articleBody")]'
         self.images_regex = r'<table[^>]*><tbody>[\s]*<tr><td[^>]*><a href="([^"]*)"[^>]*><img[^>]*></a></td></tr>[\s]*<tr><td class="tr-caption" style="[^"]*">([^<]*)'
         self.articles_regex = r"<h3 class=\'post-title entry-title\' itemprop=\'name\'>[\s]*<a href=\'([^\']*)\'>([^>^<]*)</a>[\s]*</h3>"
         self.ignore_downloads = [
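The selector change, here and in AbstractCrawler above, moves from matching the post-body class to the schema.org itemprop attribute. A quick illustration on invented markup; the assumption that some Blogger templates carry only the itemprop marker is inferred from this change, not stated in the commit:

from lxml.html.soupparser import fromstring

old_xpath = "//div[contains(concat(' ',normalize-space(@class),' '),'post-body')]"
new_xpath = '//div[contains(@itemprop, "articleBody")]'

# A theme that tags the post body only with itemprop="articleBody".
tree = fromstring('<div itemprop="articleBody"><p>text</p></div>')

print(len(tree.xpath(old_xpath)))  # 0: no post-body class to match
print(len(tree.xpath(new_xpath)))  # 1: itemprop still identifies the content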
33 changes: 9 additions & 24 deletions blog2epub/crawlers/default.py
@@ -172,29 +172,6 @@ def _atom_feed_loop(self):
                 self.interface.print(str(e))
                 self.interface.print("[article not recognized - skipping]")
 
-    def _articles_loop(self, content):
-        pass
-        #
-        # self.article_counter += 1
-        # if not self.configuration.skip or (
-        #     self.configuration.skip.isdigit()
-        #     and self.article_counter > int(self.configuration.skip)
-        # ):
-        #     art.process()
-        #     self.images = self.images + art.images
-        #     art_no = str(len(self.articles) + 1)
-        #     self.interface.print(f"{art_no}. {art.title}")
-        #     if self.start:
-        #         self.end = art.date
-        #     else:
-        #         self.start = art.date
-        #     self.articles.append(art)
-        #     self._add_tags(art.tags)
-        # else:
-        #     self.interface.print("[skipping] " + art.title)
-        # if self._break_the_loop():
-        #     break
-
     def _break_the_loop(self):
         if (
             self.cancelled
@@ -237,7 +214,15 @@ def crawl(self):
                 self.images = self.images + self._get_header_images(tree)
                 self.description = self._get_blog_description(tree)
                 self.title = self._get_blog_title(content)
-                art = self.article_class(page_url, content, )
+                art = self.article_class(page_url, content, self)
+                art.process()
+                self.images = self.images + art.images
+                if self.start:
+                    self.end = art.date
+                else:
+                    self.start = art.date
+                self.articles.append(art)
+                self.interface.print(f"{len(self.articles)}. {art.title}")
                 if self._break_the_loop():
                     break
             else:
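The crawl() loop now processes each article inline instead of delegating to the deleted _articles_loop() stub: process the article, track the first and last article dates, and stop when _break_the_loop() fires. A simplified, self-contained sketch of that bookkeeping, where Article and the limit check are stand-ins for the real crawler types:

from dataclasses import dataclass
from datetime import datetime


@dataclass
class Article:
    title: str
    date: datetime


def crawl(found_articles, limit=None):
    articles, start, end = [], None, None
    for art in found_articles:
        if start:
            end = art.date  # every later article moves the end of the range
        else:
            start = art.date  # the first processed article opens the range
        articles.append(art)
        print(f"{len(articles)}. {art.title}")
        if limit and len(articles) >= limit:  # stand-in for _break_the_loop()
            break
    return articles, start, end

With limit set to 1, as in the integration test below, the loop stops after the first article.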
53 changes: 52 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -23,6 +23,7 @@ kivymd = "^1.2.0"
 pydantic-yaml = "^1.3.0"
 plyer = "^2.1.0"
 pyjnius = "^1.6.1"
+strip-tags = "^0.5.1"
 
 [tool.poetry.group.docs.dependencies]
 mkdocs = "^1.6.0"
13 changes: 6 additions & 7 deletions tests/integration/blog2epub/test_blog2epub_main.py
@@ -1,7 +1,6 @@
 import tempfile
 
 import pytest
-import unittest
 from blog2epub.blog2epub_main import Blog2Epub
 from blog2epub.models.configuration import ConfigurationModel
@@ -10,19 +9,19 @@
 def mock_configuration() -> ConfigurationModel:
     return ConfigurationModel(
         destination_folder=tempfile.gettempdir(),
-        limit=1,
+        limit="1",
     )
 
 
-class TestBlog2EPubMain(unittest.TestCase):
-    def test_download(self, mock_configuration):
+class TestBlog2EPubMain:
+    def test_starybezpiek_downloads_one_article(self, mock_configuration):
         # given
         given_blog2epub = Blog2Epub(
-            url="https://starybezpiek.blogspot.com",
+            url="starybezpiek.blogspot.com",
             configuration=mock_configuration,
-
         )
         # when
         given_blog2epub.download()
         # then
-        assert True
+        assert len(given_blog2epub.crawler.articles) == 1
+        assert len(given_blog2epub.crawler.images) > 1
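
The test class drops unittest.TestCase so that mock_configuration can be injected as a method parameter; pytest does not inject fixtures into TestCase subclasses. The decorator sits just above the visible hunk, but presumably follows the minimal pattern below (a self-contained sketch, not code from this commit):

import pytest


@pytest.fixture
def mock_configuration() -> dict:
    # Stand-in for the real ConfigurationModel used by the integration test.
    return {"destination_folder": "/tmp", "limit": "1"}


class TestExample:  # plain class, not unittest.TestCase
    def test_uses_fixture(self, mock_configuration):
        assert mock_configuration["limit"] == "1"

Run it with pytest tests/integration/blog2epub/test_blog2epub_main.py; note that the real test hits the network and downloads one article from starybezpiek.blogspot.com.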
