Commit 3b626a0: Refactor

bohdanbobrowski committed Sep 5, 2024
1 parent 7f83b27 commit 3b626a0
Showing 10 changed files with 88 additions and 230 deletions.

21 changes: 5 additions & 16 deletions blog2epub/blog2epub_gui.py

@@ -10,7 +10,7 @@
 from itertools import cycle
 from pathlib import Path
 from threading import Thread
-from typing import Optional, List
+from typing import List
 from urllib import parse
 
 from kivy.uix.anchorlayout import AnchorLayout  # type: ignore
@@ -398,14 +398,6 @@ def _get_url(self):
             return self.url_entry.text
         raise BadUrlException("Blog url is not valid.")
 
-    @staticmethod
-    def _is_int(value) -> Optional[int]:
-        try:
-            int(value)
-            return int(value)
-        except ValueError:
-            return None
-
     def _download_ebook(self, blog2epub: Blog2Epub):
         blog2epub.download()
         self._enable_download_button()
@@ -484,23 +476,20 @@ def download(self, instance):
         self.save_settings()
         configuration = ConfigurationModel(
             **{
                 "include_images": True,
                 "images_size": (600, 800),
                 "images_quality": 40,
-                "limit": self._is_int(self.limit_entry.text),
-                "skip": self._is_int(self.skip_entry.text),
+                "limit": self.limit_entry.text,
+                "skip": self.skip_entry.text,
+                "destination_folder": str(Path.home()),
             }
         )
         self.blog2epub = Blog2Epub(
             **{
+                "interface": self.interface,
                 "url": self._get_url(),
                 "configuration": configuration,
-                "start": None,
-                "end": None,
-                "file_name": None,
                 "cache_folder": os.path.join(str(Path.home()), ".blog2epub"),
-                "destination_folder": str(Path.home()),
-                "interface": self.interface,
             }
         )
         self.download_thread = Thread(
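
With _is_int gone, the limit and skip fields are handed to ConfigurationModel as raw entry text, so any coercion or validation now lives in the model. A minimal sketch of building the same configuration outside the GUI; the model's module path and its tolerance for string inputs are assumptions, not confirmed by this diff:

    from pathlib import Path

    from blog2epub.models.configuration import ConfigurationModel  # assumed module path

    configuration = ConfigurationModel(
        include_images=True,
        images_size=(600, 800),   # maximum width/height for downloaded images
        images_quality=40,        # JPEG quality applied when resizing
        limit="10",               # raw entry text, as the GUI now passes it
        skip="0",
        destination_folder=str(Path.home()),
    )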

9 changes: 7 additions & 2 deletions blog2epub/blog2epub_main.py

@@ -1,5 +1,7 @@
 import logging
 
 from blog2epub.common.globals import VERSION
-from blog2epub.crawlers import BlogspotCrawler, WordpressCrawler
+from blog2epub.crawlers import BlogspotCrawler, WordpressCrawler, GenericCrawler
 
 
 class Blog2Epub:
@@ -10,7 +12,10 @@ class Blog2Epub:
     def __init__(self, **kwargs):
         if kwargs.get("url").find(".blogspot.") > -1:
             self.crawler = BlogspotCrawler(**kwargs)
-        self.crawler = WordpressCrawler(**kwargs)
+        if kwargs.get("url").find(".wordpress.com") > -1:
+            self.crawler = WordpressCrawler(**kwargs)
+        else:
+            self.crawler = GenericCrawler(**kwargs)
 
 
     def download(self):
         self.crawler.crawl()
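
Crawler selection now falls through to the new GenericCrawler when the URL matches neither Blogspot nor Wordpress. A hedged usage sketch; the keyword arguments mirror the GUI call above, and the interface import path and its instantiability are assumptions:

    import os
    from pathlib import Path

    from blog2epub.blog2epub_main import Blog2Epub
    from blog2epub.common.interfaces import EmptyInterface  # assumed import path

    blog2epub = Blog2Epub(
        interface=EmptyInterface(),   # assumes the base interface is instantiable
        url="https://example.wordpress.com",  # picks WordpressCrawler
        configuration=configuration,  # ConfigurationModel from the sketch above
        cache_folder=os.path.join(str(Path.home()), ".blog2epub"),
    )
    blog2epub.download()  # delegates to self.crawler.crawl()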

35 changes: 22 additions & 13 deletions blog2epub/common/downloader.py

@@ -5,14 +5,15 @@
 import re
 
 from http.cookiejar import CookieJar
-from typing import Optional
+from typing import Optional, List
 from urllib.parse import urlparse
 import time
 from PIL import Image
 
 import requests
 
 from blog2epub.models.book import DirModel
+from blog2epub.common.interfaces import EmptyInterface
 
 
 def prepare_directories(dirs: DirModel):
@@ -23,15 +24,23 @@ def prepare_directories(dirs: DirModel):
 
 
 class Downloader:
-    def __init__(self, crawler):
-        self.dirs = crawler.dirs
-        self.url_to_crawl = crawler.url_to_crawl
-        self.crawler_url = crawler.url
-        self.crawler_port = crawler.port
-        self.interface = crawler.interface
-        self.images_size = crawler.images_size
-        self.images_quality = crawler.images_quality
-        self.ignore_downloads = crawler.ignore_downloads
+    def __init__(
+        self,
+        dirs: DirModel,
+        url: str,
+        url_to_crawl: str,
+        interface: EmptyInterface,
+        images_size: List[int],
+        images_quality: int,
+        ignore_downloads: List[str],
+    ):
+        self.dirs = dirs
+        self.url = url
+        self.url_to_crawl = url_to_crawl
+        self.interface = interface
+        self.images_size = images_size
+        self.images_quality = images_quality
+        self.ignore_downloads = ignore_downloads
         self.cookies = CookieJar()
         self.session = requests.session()
         self.headers = {}
@@ -115,14 +124,14 @@ def get_content(self, url):
             interstitial = self.checkInterstitial(contents)
             if interstitial:
                 interstitial_url = (
-                    "http://" + self.crawler_url + "?interstitial=" + interstitial
+                    "http://" + self.url + "?interstitial=" + interstitial
                 )
                 self.file_download(
                     interstitial_url, self.get_filepath(interstitial_url)
                 )
                 contents = self.file_download(
-                    "http://" + self.crawler_url,
-                    self.get_filepath("http://" + self.crawler_url),
+                    "http://" + self.url,
+                    self.get_filepath("http://" + self.url),
                 )
         return contents
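
The constructor change is the heart of this file: Downloader no longer reaches into a crawler instance for its settings but receives every dependency explicitly, so it can be built and tested in isolation. A sketch under the assumption that DirModel accepts a base path and EmptyInterface is instantiable (neither is confirmed by this diff):

    from blog2epub.common.downloader import Downloader
    from blog2epub.common.interfaces import EmptyInterface  # assumed import path
    from blog2epub.models.book import DirModel

    downloader = Downloader(
        dirs=DirModel(path="/tmp/blog2epub_cache"),  # assumed DirModel signature
        url="example.blogspot.com",
        url_to_crawl="https://example.blogspot.com",
        interface=EmptyInterface(),
        images_size=[600, 800],
        images_quality=40,
        ignore_downloads=[],
    )
    contents = downloader.get_content("https://example.blogspot.com")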

2 changes: 1 addition & 1 deletion blog2epub/common/globals.py

@@ -1 +1 @@
-VERSION = "1.3.1"
+VERSION = "1.4.0"

175 changes: 17 additions & 158 deletions blog2epub/crawlers/abstract.py

@@ -6,19 +6,11 @@
 from pathlib import Path
 from typing import List, Optional, Dict
 from xml import etree
-import gzip
-import hashlib
-import imghdr
 import re
-import time
-from http.cookiejar import CookieJar
-from urllib.parse import urlparse
 
-import requests
 from lxml.html.soupparser import fromstring
-from PIL import Image
 
-from blog2epub.common.downloader import prepare_directories
+from blog2epub.common.downloader import Downloader
 import dateutil
 
 from blog2epub.common.book import Book
@@ -33,8 +25,6 @@
 
 
 class AbstractCrawler(ABC):
-    ignore_downloads: List[str] = []
-
     def __init__(
         self,
         url: str,
@@ -67,10 +57,25 @@ def __init__(
         self.articles: List[Article] = []
         self.article_counter = 0
         self.images: List[str] = []
-        self.downloader = Downloader(self)
         self.tags: Dict = {}
         self.active = False
         self.cancelled = False
+        self.ignore_downloads: List[str] = []
+        self.article_class = "Article"
+        self.content_xpath = (
+            "//div[contains(concat(' ',normalize-space(@class),' '),'post-body')]"
+        )
+        self.images_regex = r'<table[^>]*><tbody>[\s]*<tr><td[^>]*><a href="([^"]*)"[^>]*><img[^>]*></a></td></tr>[\s]*<tr><td class="tr-caption" style="[^"]*">([^<]*)'
+        self.articles_regex = r"<h3 class=\'post-title entry-title\' itemprop=\'name\'>[\s]*<a href=\'([^\']*)\'>([^>^<]*)</a>[\s]*</h3>"
+        self.downloader = Downloader(
+            dirs=self.dirs,
+            url=self.url,
+            url_to_crawl=self.url_to_crawl,
+            interface=self.interface,
+            images_size=self.configuration.images_size,
+            images_quality=self.configuration.images_quality,
+            ignore_downloads=self.ignore_downloads,
+        )
 
     @abstractmethod
     def crawl(self):
@@ -333,149 +338,3 @@ def process(self):
         self.get_content()
         self.get_tags()
         self.get_comments()
-
-
-class Downloader:
-    def __init__(self, crawler):
-        self.dirs = crawler.dirs
-        self.url_to_crawl = crawler.url_to_crawl
-        self.crawler_url = crawler.url
-        self.crawler_port = crawler.port
-        self.interface = crawler.interface
-        self.images_size = crawler.images_size
-        self.images_quality = crawler.images_quality
-        self.ignore_downloads = crawler.ignore_downloads
-        self.cookies = CookieJar()
-        self.session = requests.session()
-        self.headers = {}
-
-    def get_urlhash(self, url):
-        m = hashlib.md5()
-        m.update(url.encode("utf-8"))
-        return m.hexdigest()
-
-    def file_write(self, contents, filepath):
-        filepath = filepath + ".gz"
-        with gzip.open(filepath, "wb") as f:
-            f.write(contents.encode("utf-8"))
-
-    def file_read(self, filepath):
-        if os.path.isfile(filepath + ".gz"):
-            with gzip.open(filepath + ".gz", "rb") as f:
-                contents = f.read().decode("utf-8")
-        else:
-            with open(filepath, "rb") as html_file:
-                contents = html_file.read().decode("utf-8")
-            self.file_write(contents, filepath)
-            os.remove(filepath)
-        return contents
-
-    def get_filepath(self, url):
-        return os.path.join(self.dirs.html, self.get_urlhash(url) + ".html")
-
-    def _is_url_in_ignored(self, url) -> bool:
-        for search_rule in self.ignore_downloads:
-            if re.match(search_rule, url):
-                return True
-        return False
-
-    def file_download(self, url: str, filepath: str) -> Optional[str]:
-        if self._is_url_in_ignored(url):
-            return None
-        prepare_directories(self.dirs)
-        try:
-            response = self.session.get(url, cookies=self.cookies, headers=self.headers)
-        except requests.exceptions.ConnectionError:
-            return None
-        self.cookies = response.cookies
-        data = response.content
-        contents = data.decode("utf-8")
-        self.file_write(contents, filepath)
-        return contents
-
-    def image_download(self, url: str, filepath: str) -> bool | None:
-        if self._is_url_in_ignored(url):
-            return None
-        prepare_directories(self.dirs)
-        try:
-            response = self.session.get(url, cookies=self.cookies, headers=self.headers)
-        except requests.exceptions.ConnectionError:
-            return False
-        with open(filepath, "wb") as f:
-            f.write(response.content)
-        time.sleep(1)
-        return True
-
-    def checkInterstitial(self, contents):
-        interstitial = re.findall('interstitial=([^"]+)', contents)
-        if interstitial:
-            return interstitial[0]
-        return False
-
-    def get_content(self, url):
-        # TODO: This needs refactor!
-        filepath = self.get_filepath(url)
-        for x in range(0, 3):
-            if not os.path.isfile(filepath) and not os.path.isfile(filepath + ".gz"):
-                contents = self.file_download(url, filepath)
-            else:
-                contents = self.file_read(filepath)
-            if contents is not None:
-                break
-            self.interface.print(f"...repeat request: {url}")
-            time.sleep(3)
-        if contents:
-            interstitial = self.checkInterstitial(contents)
-            if interstitial:
-                interstitial_url = (
-                    "http://" + self.crawler_url + "?interstitial=" + interstitial
-                )
-                self.file_download(
-                    interstitial_url, self.get_filepath(interstitial_url)
-                )
-                contents = self.file_download(
-                    "http://" + self.crawler_url,
-                    self.get_filepath("http://" + self.crawler_url),
-                )
-        return contents
-
-    def _fix_image_url(self, img: str) -> str:
-        if not img.startswith("http"):
-            uri = urlparse(self.url_to_crawl)
-            if uri.netloc not in img:
-                img = os.path.join(uri.netloc, img)
-            while not img.startswith("//"):
-                img = "/" + img
-            img = f"{uri.scheme}:{img}"
-        return img
-
-    def download_image(self, img: str) -> Optional[str]:
-        img = self._fix_image_url(img)
-        img_hash = self.get_urlhash(img)
-        img_type = os.path.splitext(img)[1].lower()
-        if img_type not in [".jpeg", ".jpg", ".png", ".bmp", ".gif", ".webp"]:
-            return None
-        original_fn = os.path.join(self.dirs.originals, img_hash + "." + img_type)
-        resized_fn = os.path.join(self.dirs.images, img_hash + ".jpg")
-        if os.path.isfile(resized_fn):
-            return img_hash + ".jpg"
-        if not os.path.isfile(resized_fn):
-            self.image_download(img, original_fn)
-        if os.path.isfile(original_fn):
-            original_img_type = imghdr.what(original_fn)
-            if original_img_type is None:
-                os.remove(original_fn)
-                return None
-            picture = Image.open(original_fn)
-            if (
-                picture.size[0] > self.images_size[0]
-                or picture.size[1] > self.images_size[1]
-            ):
-                picture.thumbnail(self.images_size, Image.LANCZOS)  # type: ignore
-            converted_picture = picture.convert("L")
-            converted_picture.save(
-                resized_fn, format="JPEG", quality=self.images_quality
-            )
-            os.remove(original_fn)
-            return img_hash + ".jpg"
-        return None

29 changes: 15 additions & 14 deletions blog2epub/crawlers/blogspot.py

@@ -6,17 +6,18 @@
 class BlogspotCrawler(DefaultCrawler):
     """Blogspot.com crawler."""
 
-    content_xpath = (
-        "//div[contains(concat(' ',normalize-space(@class),' '),'post-body')]"
-    )
-    images_regex = r'<table[^>]*><tbody>[\s]*<tr><td[^>]*><a href="([^"]*)"[^>]*><img[^>]*></a></td></tr>[\s]*<tr><td class="tr-caption" style="[^"]*">([^<]*)'
-    articles_regex = r"<h3 class=\'post-title entry-title\' itemprop=\'name\'>[\s]*<a href=\'([^\']*)\'>([^>^<]*)</a>[\s]*</h3>"
-
-    ignore_downloads = [
-        r"https:\/\/zblogowani\.pl\/[a-z]+\/[0-9]+x[0-9]+\/[a-z]+\/[0-9]+\/btn\.png",
-        r"https:\/\/www.blogger.com\/img\/blogger_logo_[a-z]+_[0-9]+\.png",
-        r"https:\/\/resources.blogblog.com\/img\/[a-z0-9_]+.gif",
-        r"https:\/\/www.paypalobjects.com\/[a-zA-Z_]+\/i\/scr\/pixel.gif",
-        r"https:\/\/resources.blogblog.com\/img\/widgets\/[a-zA-Z0-9\-\.]+",
-        r"https:\/\/[a-zA-Z0-9\.\/\-\_]+icon[0-9]+_[a-z]+.gif",
-    ]
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.content_xpath = (
+            "//div[contains(concat(' ',normalize-space(@class),' '),'post-body')]"
+        )
+        self.images_regex = r'<table[^>]*><tbody>[\s]*<tr><td[^>]*><a href="([^"]*)"[^>]*><img[^>]*></a></td></tr>[\s]*<tr><td class="tr-caption" style="[^"]*">([^<]*)'
+        self.articles_regex = r"<h3 class=\'post-title entry-title\' itemprop=\'name\'>[\s]*<a href=\'([^\']*)\'>([^>^<]*)</a>[\s]*</h3>"
+        self.ignore_downloads = [
+            r"https:\/\/zblogowani\.pl\/[a-z]+\/[0-9]+x[0-9]+\/[a-z]+\/[0-9]+\/btn\.png",
+            r"https:\/\/www.blogger.com\/img\/blogger_logo_[a-z]+_[0-9]+\.png",
+            r"https:\/\/resources.blogblog.com\/img\/[a-z0-9_]+.gif",
+            r"https:\/\/www.paypalobjects.com\/[a-zA-Z_]+\/i\/scr\/pixel.gif",
+            r"https:\/\/resources.blogblog.com\/img\/widgets\/[a-zA-Z0-9\-\.]+",
+            r"https:\/\/[a-zA-Z0-9\.\/\-\_]+icon[0-9]+_[a-z]+.gif",
+        ]
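
Moving the selectors from class attributes to instance attributes set after super().__init__() means every crawler configures the shared Downloader-backed machinery the same way. A hypothetical subclass sketch showing the pattern; ExampleCrawler, its xpath, and its regex are illustrative only:

    from blog2epub.crawlers.default import DefaultCrawler  # assumed import path


    class ExampleCrawler(DefaultCrawler):
        """Crawler for a hypothetical example.com blog engine."""

        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            self.content_xpath = "//article[contains(@class, 'entry-content')]"
            self.articles_regex = r'<h2 class="entry-title"><a href="([^"]*)">([^<]*)</a></h2>'
            self.ignore_downloads = [r"https:\/\/example\.com\/ads\/.*"]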