diff --git a/kolibri2zim/debug.py b/kolibri2zim/debug.py
index d91ab5d..0c513a2 100644
--- a/kolibri2zim/debug.py
+++ b/kolibri2zim/debug.py
@@ -4,10 +4,12 @@
 import re
 import tempfile
 import urllib.parse
+from typing import Optional, Union
 
-from zimscraperlib.download import stream_file
+import libzim.writer
+import requests
+from zimscraperlib.download import stream_file, _get_retry_adapter
 from zimscraperlib.zim.items import StaticItem
-from zimscraperlib.zim.providers import URLProvider
 
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger("DEBUG")
@@ -18,6 +20,53 @@
 bs2 = 1226905
 bss = [bs1, bs2]
 
+session = requests.Session()
+session.mount("http", _get_retry_adapter())
+
+
+class URLProvider(libzim.writer.ContentProvider):
+    """Provider downloading content as it is consumed by the libzim
+
+    Useful for non-indexed content for which feed() is called only once"""
+
+    def __init__(
+        self, url: str, size: Optional[int] = None, ref: Optional[object] = None
+    ):
+        super().__init__()
+        self.url = url
+        self.size = size if size is not None else self.get_size_of(url)
+        self.ref = ref
+
+        self.resp = session.get(url, stream=True)
+        self.resp.raise_for_status()
+
+    @staticmethod
+    def get_size_of(url) -> Union[int, None]:
+        """Content-Length announced for `url`, or None if unavailable"""
+        _, headers = stream_file(url, byte_stream=io.BytesIO(), only_first_block=True)
+        try:
+            return int(headers["Content-Length"])
+        except Exception:
+            return None
+
+    def get_size(self) -> int:
+        return self.size
+
+    def gen_blob(self) -> libzim.writer.Blob:  # pragma: nocover
+        """Yield the body in chunks of up to 10KiB, then an empty closing blob"""
+        read = 0
+        source = self.resp.iter_content(10 * 1024)
+        # size is None when no Content-Length was found: read until exhaustion
+        while self.size is None or read < self.size:
+            # default prevents a StopIteration (RuntimeError inside a generator)
+            # when the body is shorter than the announced size
+            data = next(source, None)
+            if data is None:
+                break
+            read += len(data)
+            yield libzim.writer.Blob(data)
+        yield libzim.writer.Blob(b"")
+
 
 class URLItem(StaticItem):
     """StaticItem to automatically fetch and feed an URL resource
@@ -52,6 +101,7 @@ def __init__(self, url: str, **kwargs):
         super().__init__(**kwargs)
         self.url = urllib.parse.urlparse(url)
         use_disk = getattr(self, "use_disk", False)
+        nostream_threshold = getattr(self, "nostream_threshold", -1)
 
         logger.info(f"> {self.url.geturl()}")
 
@@ -100,6 +150,11 @@ def __init__(self, url: str, **kwargs):
             else:
                 self.fileobj = target
 
+        # content at or below the threshold is fetched into memory in one go
+        if self.size and nostream_threshold and self.size <= nostream_threshold:
+            self.fileobj = io.BytesIO()
+            stream_file(self.url.geturl(), byte_stream=self.fileobj)
+
     def get_path(self) -> str:
         return getattr(self, "path", re.sub(r"^/", "", self.url.path))
 
diff --git a/kolibri2zim/scraper.py b/kolibri2zim/scraper.py
index 7f9aac4..4bdb6fd 100644
--- a/kolibri2zim/scraper.py
+++ b/kolibri2zim/scraper.py
@@ -61,6 +61,7 @@
     "css",
     "dedup_html_files",
 ]
+NOSTREAM_FUNNEL_SIZE = 2**20 * 2  # 2MiB
 
 
 def filename_for(file):
@@ -192,7 +193,9 @@ def funnel_file(self, fid, fext):
         """directly add a Kolibri file to the ZIM using same name"""
         url, fname = get_kolibri_url_for(fid, fext)
         with self.creator_lock:
-            self.creator.add_item(URLItem(url=url, path=fname))
+            self.creator.add_item(
+                URLItem(url=url, path=fname, nostream_threshold=NOSTREAM_FUNNEL_SIZE)
+            )
         logger.debug(f"Added {fname} from Studio")
 
     def download_to_disk(self, file_id, ext):