chore: hijack requests to fake a user agent

potatoeggy · Oct 16, 2024 · 1ff77e6 · 1ff77e6
1 parent 5a7e5e3
commit 1ff77e6
Show file tree

Hide file tree

Showing 10 changed files with 25 additions and 59 deletions.
diff --git a/mandown/io.py b/mandown/io.py
@@ -6,7 +6,7 @@
 from typing import Iterator, Sequence
 
 import filetype
-import requests
+import requests as RealRequests
 from natsort import natsorted
 
 from .base import BaseChapter, BaseMetadata
@@ -31,7 +31,7 @@ def async_download_image(data: AsyncDownloadImageInput) -> None:
     name = filename or url.split("/")[-1]
     dest_file = dest_folder / name
 
-    res = requests.get(url, headers=headers, timeout=5)
+    res = RealRequests.get(url, headers=headers, timeout=5)
 
     if res.status_code != 200:
         # there is no clean way to raise an error in a pool
@@ -177,3 +177,12 @@ def discover_local_images(path: Path | str) -> dict[str, list[Path]]:
         for chap in sorted(path.iterdir())  # iterdir does not guarantee any order
         if chap.is_dir()  # force explosion for readability
     }
+
+
+USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"  # noqa: E501
+
+
+class requests:
+    """
+    Minimal stand-in for the `requests` module that sends a browser User-Agent.
+
+    Only `get` is provided; it forwards to the real library (`RealRequests`).
+    """
+
+    @staticmethod
+    def get(url: str, timeout: float | None = 5) -> RealRequests.Response:
+        """
+        GET `url` with the `USER_AGENT` header set.
+
+        `timeout` is in seconds and is passed straight to the real `get`; the
+        default matches the image download in this module. Without one, a
+        server that never answers would block the caller indefinitely.
+        """
+        return RealRequests.get(url, headers={"User-Agent": USER_AGENT}, timeout=timeout)
diff --git a/mandown/sources/base_source.py b/mandown/sources/base_source.py
@@ -1,4 +1,5 @@
 from ..base import BaseChapter, BaseMetadata
+from ..io import requests
 
 
 class BaseSource:
@@ -13,9 +14,21 @@ class BaseSource:
     _metadata: BaseMetadata | None = None
     _chapters: list[BaseChapter] = []
 
+    _scripts: str | None = None
+
     def __init__(self, url: str):
         self.url = url
 
+    def _get_scripts(self) -> str:
+        """
+        Return the HTML of `self.url`, downloading it when no non-empty copy is cached.
+
+        Legacy helper; the page text is kept on `self._scripts` for later calls.
+        """
+        if not self._scripts:
+            self._scripts = requests.get(self.url).text or ""
+        return self._scripts
+
     @property
     def metadata(self) -> BaseMetadata:
         """

diff --git a/mandown/sources/source_batoto.py b/mandown/sources/source_batoto.py
@@ -71,13 +71,6 @@ def fetch_chapter_image_list(self, chapter: BaseChapter) -> list[str]:
             images.append(image[1])
         return images
 
-    def _get_scripts(self) -> str:
-        if self._scripts:
-            return self._scripts
-
-        self._scripts = requests.get(self.url).text or ""
-        return self._scripts
-
     @classmethod
     def url_to_id(cls, url: str) -> str:
         items = list(filter(None, url.split("/")))

diff --git a/mandown/sources/source_blogtruyenmoi.py b/mandown/sources/source_blogtruyenmoi.py
@@ -61,13 +61,6 @@ def fetch_chapter_image_list(self, chapter: BaseChapter) -> list[str]:
         soup = BeautifulSoup(requests.get(chapter.url).text, "lxml")
         return [el["src"] for el in soup.select("article#content > img")]
 
-    def _get_scripts(self) -> str:
-        if self._scripts:
-            return self._scripts
-
-        self._scripts = requests.get(self.url).text or ""
-        return self._scripts
-
     @classmethod
     def url_to_id(cls, url: str) -> str:
         items = list(filter(None, url.split("/")))

diff --git a/mandown/sources/source_comixextra.py b/mandown/sources/source_comixextra.py
@@ -4,10 +4,10 @@
 
 import re
 
-import requests
 from bs4 import BeautifulSoup
 
 from ..base import BaseChapter, BaseMetadata
+from ..io import requests
 from .base_source import BaseSource
 
 
@@ -58,13 +58,6 @@ def fetch_chapter_image_list(self, chapter: BaseChapter) -> list[str]:
 
         return [i["src"] for i in BeautifulSoup(text, "lxml").select(".chapter-container img")]
 
-    def _get_scripts(self) -> str:
-        if self._scripts:
-            return self._scripts
-
-        self._scripts = requests.get(self.url).text or ""
-        return self._scripts
-
     @classmethod
     def url_to_id(cls, url: str) -> str:
         items = list(filter(None, url.split("/")))

diff --git a/mandown/sources/source_kuaikanmanhua.py b/mandown/sources/source_kuaikanmanhua.py
@@ -83,13 +83,6 @@ def fetch_chapter_image_list(self, chapter: BaseChapter) -> list[str]:
         strings: list[str] = json.loads(constructed)
         return [s for s in strings if isinstance(s, str)]  # there may be ints
 
-    def _get_scripts(self) -> str:
-        if self._scripts:
-            return self._scripts
-
-        self._scripts = requests.get(self.url).text or ""
-        return self._scripts
-
     @classmethod
     def url_to_id(cls, url: str) -> str:
         items = list(filter(None, url.split("/")))

diff --git a/mandown/sources/source_mangakakalot.py b/mandown/sources/source_mangakakalot.py
@@ -56,13 +56,6 @@ def fetch_chapter_image_list(self, chapter: BaseChapter) -> list[str]:
             images.append(i["src"])
         return images
 
-    def _get_scripts(self) -> str:
-        if self._scripts:
-            return self._scripts
-
-        self._scripts = requests.get(self.url).text
-        return self._scripts
-
     @classmethod
     def url_to_id(cls, url: str) -> str:
         *_, last_item = filter(None, url.split("/"))

diff --git a/mandown/sources/source_manganato.py b/mandown/sources/source_manganato.py
@@ -60,13 +60,6 @@ def fetch_chapter_image_list(self, chapter: BaseChapter) -> list[str]:
                 images.append(i["src"])
         return images
 
-    def _get_scripts(self) -> str:
-        if self._scripts:
-            return self._scripts
-
-        self._scripts = requests.get(self.url).text
-        return self._scripts
-
     @classmethod
     def url_to_id(cls, url: str) -> str:
         *_, last_item = filter(None, url.split("/"))

diff --git a/mandown/sources/source_mangasee.py b/mandown/sources/source_mangasee.py
@@ -86,13 +86,6 @@ def fetch_chapter_image_list(self, chapter: BaseChapter) -> list[str]:
     def check_url(url: str) -> bool:
         return bool(re.match(r"https://mangasee123.com/manga/.*", url))
 
-    def _get_scripts(self) -> str:
-        if self._scripts:
-            return self._scripts
-
-        self._scripts = requests.get(self.url).text
-        return self._scripts
-
     @classmethod
     def url_to_id(cls, url: str) -> str:
         # converts page url to id

diff --git a/mandown/sources/source_manhuaes.py b/mandown/sources/source_manhuaes.py
@@ -66,13 +66,6 @@ def fetch_chapter_image_list(self, chapter: BaseChapter) -> list[str]:
         soup = BeautifulSoup(requests.get(chapter.url).text, "lxml")
         return [el["data-src"] for el in soup.select("img.wp-manga-chapter-img")]
 
-    def _get_scripts(self) -> str:
-        if self._scripts:
-            return self._scripts
-
-        self._scripts = requests.get(self.url).text or ""
-        return self._scripts
-
     @classmethod
     def url_to_id(cls, url: str) -> str:
         items = list(filter(None, url.split("/")))