Skip to content

Commit

Permalink
Trying to avoid issue #40
Browse files Browse the repository at this point in the history
- Duplicated and modified the URLProvider:
  - reading source until we reached specified size (/!\ risk of being stuck)
- clearly returning an empty Blob at the end (might have been the reason)
- Added new feature to URLItem to not use URLProvider for content under 2MiB
  • Loading branch information
rgaudin committed Feb 10, 2023
1 parent f57adcd commit 2c3fe80
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 3 deletions.
51 changes: 49 additions & 2 deletions kolibri2zim/debug.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
import re
import tempfile
import urllib.parse
from typing import Optional, Union

from zimscraperlib.download import stream_file
import libzim.writer
import requests
from zimscraperlib.download import stream_file, _get_retry_adapter
from zimscraperlib.zim.items import StaticItem
from zimscraperlib.zim.providers import URLProvider

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("DEBUG")
Expand All @@ -18,6 +20,46 @@
bs2 = 1226905
bss = [bs1, bs2]

# module-wide HTTP session so all downloads share connection pooling and retries
session = requests.Session()
# mounting on the "http" prefix also matches "https" URLs, so the retry
# adapter applies to both schemes
session.mount("http", _get_retry_adapter())


class URLProvider(libzim.writer.ContentProvider):
    """Provider downloading content as it is consumed by the libzim.

    Useful for non-indexed content for which feed() is called only once."""

    def __init__(
        self, url: str, size: Optional[int] = None, ref: Optional[object] = None
    ):
        """Open a streamed HTTP request for `url`.

        url: resource to download
        size: expected content size in bytes; probed from the server when None
        ref: opaque object kept alive for the provider's lifetime
        """
        super().__init__()
        self.url = url
        # prefer the caller-supplied size; otherwise ask the server for it
        # NOTE(review): get_size_of() may return None (no Content-Length);
        # get_size()/gen_blob() assume an int — confirm callers always know the size
        self.size = size if size is not None else self.get_size_of(url)
        self.ref = ref

        self.resp = session.get(url, stream=True)
        self.resp.raise_for_status()

    @staticmethod
    def get_size_of(url) -> Union[int, None]:
        """Size in bytes announced by the server for `url`, or None if unknown."""
        _, headers = stream_file(url, byte_stream=io.BytesIO(), only_first_block=True)
        try:
            return int(headers["Content-Length"])
        except Exception:
            return None

    def get_size(self) -> int:
        """Total content size reported to libzim; must match bytes yielded."""
        return self.size

    def gen_blob(self) -> libzim.writer.Blob:  # pragma: nocover
        """Yield content Blobs as downloaded, then a final empty Blob.

        Yields exactly self.size bytes: a short server response ends the
        stream early instead of raising, and an over-long final chunk is
        trimmed so we never exceed the size declared via get_size().
        """
        read = 0
        source = self.resp.iter_content(10 * 1024)
        while read < self.size:
            try:
                data = next(source)
            except StopIteration:
                # server sent fewer bytes than announced; stop cleanly rather
                # than letting StopIteration become a RuntimeError (PEP 479)
                break
            # trim so the total never exceeds the declared size
            remaining = self.size - read
            if len(data) > remaining:
                data = data[:remaining]
            read += len(data)
            yield libzim.writer.Blob(data)
        # an explicit empty Blob signals end-of-content to libzim
        yield libzim.writer.Blob(b"")


class URLItem(StaticItem):
"""StaticItem to automatically fetch and feed an URL resource
Expand Down Expand Up @@ -52,6 +94,7 @@ def __init__(self, url: str, **kwargs):
super().__init__(**kwargs)
self.url = urllib.parse.urlparse(url)
use_disk = getattr(self, "use_disk", False)
nostream_threshold = getattr(self, "nostream_threshold", -1)

logger.info(f"> {self.url.geturl()}")

Expand Down Expand Up @@ -100,6 +143,10 @@ def __init__(self, url: str, **kwargs):
else:
self.fileobj = target

if self.size and nostream_threshold and self.size <= nostream_threshold:
self.fileobj = io.BytesIO()
stream_file(self.url.geturl(), byte_stream=self.fileobj)

def get_path(self) -> str:
return getattr(self, "path", re.sub(r"^/", "", self.url.path))

Expand Down
5 changes: 4 additions & 1 deletion kolibri2zim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
"css",
"dedup_html_files",
]
NOSTREAM_FUNNEL_SIZE = 2**20 * 2 # 2MiB


def filename_for(file):
Expand Down Expand Up @@ -192,7 +193,9 @@ def funnel_file(self, fid, fext):
"""directly add a Kolibri file to the ZIM using same name"""
url, fname = get_kolibri_url_for(fid, fext)
with self.creator_lock:
self.creator.add_item(URLItem(url=url, path=fname))
self.creator.add_item(
URLItem(url=url, path=fname, nostream_threshold=NOSTREAM_FUNNEL_SIZE)
)
logger.debug(f"Added {fname} from Studio")

def download_to_disk(self, file_id, ext):
Expand Down

0 comments on commit 2c3fe80

Please sign in to comment.