Skip to content

Commit

Permalink
Testing URLItem with verbose debug
Browse files Browse the repository at this point in the history
- Investigating #40, using a copy of scraperlib's URLItem with verbose details
to identify which URL causes the issue

- not crashing on resource duplicates (duplicate content in different node IDs)
- fixed succeeded boolean that would have caused creating the ZIM even on exception

- [debug] raising first exception

- updated scraperlib to 2.0
  • Loading branch information
rgaudin committed Feb 9, 2023
1 parent 869e11d commit f57adcd
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 4 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Changed

- Using zimscraperlib 1.8, creating libzim7 ZIM files
- Using zimscraperlib 2.0, creating libzim8 ZIM files
- Allowing duplicated resources at different paths
- Fixed succeeded status in case an exception was raised
- Updated dependencies (beautifulsoup4, Jinja2)

## [1.0.0] - 2021-11-11
Expand Down
124 changes: 124 additions & 0 deletions kolibri2zim/debug.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import io
import logging
import pathlib
import re
import tempfile
import urllib.parse

from zimscraperlib.download import stream_file
from zimscraperlib.zim.items import StaticItem
from zimscraperlib.zim.providers import URLProvider

# verbose logging so every URL handled by URLItem is traced (investigating #40)
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("DEBUG")
# the two byte counts from libzim's mismatch error we're hunting:
# size[464505] == provider->getSize()[1226905]
# 453.62 KiB 1.17 MiB

bs1, bs2 = 464505, 1226905
bss = [bs1, bs2]


class URLItem(StaticItem):
    """StaticItem to automatically fetch and feed a URL resource

    Verbose-debug copy of scraperlib's URLItem: logs every URL it handles and
    flags any resource whose size matches one of the suspect sizes in `bss`.

    Appropriate for retrieving/bundling static assets that you don't need to
    post-process.
    Uses URL's path as zim path if none provided
    Keeps single in-memory copy of content for HTML resources (indexed)
    Works transparently on servers returning a Content-Length header (most)
    *Swaps* a copy of the content either in memory or on disk (`use_disk=True`)
    in case the content size could not be retrieved from headers.
    Use `tmp_dir` to point location of that temp file."""

    @staticmethod
    def download_for_size(url, on_disk, tmp_dir=None):
        """Download URL fully and return (target, size)

        url: urllib.parse.ParseResult of the resource to download
        on_disk: store content in a temp file instead of an in-memory stream
        tmp_dir: directory to create the temp file in (only used with on_disk)

        Returns a (pathlib.Path or io.BytesIO, size-in-bytes) tuple."""
        fpath = stream = None
        if on_disk:
            # keep the URL path's extension so mimetype detection still works
            suffix = pathlib.Path(re.sub(r"^/", "", url.path)).suffix
            fpath = pathlib.Path(
                tempfile.NamedTemporaryFile(
                    suffix=suffix, delete=False, dir=tmp_dir
                ).name
            )
        else:
            stream = io.BytesIO()
        size, _ = stream_file(url.geturl(), fpath=fpath, byte_stream=stream)
        return fpath or stream, size

    def __init__(self, url: str, **kwargs):
        super().__init__(**kwargs)
        self.url = urllib.parse.urlparse(url)
        use_disk = getattr(self, "use_disk", False)

        logger.info(f"> {self.url.geturl()}")

        # fetch headers to retrieve size and type
        try:
            _, self.headers = stream_file(
                url, byte_stream=io.BytesIO(), only_first_block=True
            )
        except Exception as exc:
            # chain the original exception so the root cause stays visible
            raise IOError(f"Unable to access URL at {url}: {exc}") from exc

        # HTML content will be indexed.
        # we proxy the content in the Item to prevent double-download of the resource
        # we use a value-variable to prevent race-conditions in the multiple
        # reads of the content in the provider
        if self.should_index:
            self.fileobj = io.BytesIO()
            self.size, _ = stream_file(self.url.geturl(), byte_stream=self.fileobj)
            logger.info(f"> {self.url.geturl()} SHOULD_INDEX {self.size=}")
            if self.size in bss:
                # fixed: was missing the f prefix, logging the literal template
                logger.error(f"FOUND {self.url.geturl()} HAS {self.size}")
            return

        try:
            # Encoded data (compressed) prevents us from using Content-Length header
            # as source for the content (it represents length of compressed data)
            if self.headers.get("Content-Encoding", "identity") != "identity":
                raise ValueError("Can't trust Content-Length for size")
            # non-html, non-compressed data.
            self.size = int(self.headers["Content-Length"])
            logger.info(f"> {self.url.geturl()} Content-Length {self.size=}")
            if self.size in bss:
                # fixed: was missing the f prefix, logging the literal template
                logger.error(f"FOUND {self.url.geturl()} HAS {self.size}")
        except Exception:
            # we couldn't retrieve size so we have to download resource to
            target, self.size = self.download_for_size(
                self.url, on_disk=use_disk, tmp_dir=getattr(self, "tmp_dir", None)
            )
            logger.info(f"> {self.url.geturl()} Downloaded {self.size=}")
            if self.size in bss:
                # fixed: was missing the f prefix, logging the literal template
                logger.error(f"FOUND {self.url.geturl()} HAS {self.size}")
            # downloaded to disk and using a file path from now on
            if use_disk:
                self.filepath = target
            # downloaded to RAM and using a bytes object
            else:
                self.fileobj = target

    def get_path(self) -> str:
        """Zim path: explicit `path` attribute or the URL path sans leading /"""
        return getattr(self, "path", re.sub(r"^/", "", self.url.path))

    def get_title(self) -> str:
        """Item title: explicit `title` attribute or empty string"""
        return getattr(self, "title", "")

    def get_mimetype(self) -> str:
        """Mimetype: explicit attribute, server's Content-Type, or binary default"""
        return getattr(
            self,
            "mimetype",
            self.headers.get("Content-Type", "application/octet-stream"),
        )

    def get_contentprovider(self):
        """Content provider: parent's (file/bytes based) or URLProvider fallback"""
        try:
            return super().get_contentprovider()
        except NotImplementedError:
            if not getattr(self, "size", None):
                # fixed: was missing the f prefix, logging the literal template
                logger.info(f"> {self.url.geturl()} HAS NO SIZE FOR CP")
            return URLProvider(
                url=self.url.geturl(), size=getattr(self, "size", None), ref=self
            )
19 changes: 17 additions & 2 deletions kolibri2zim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from kiwixstorage import KiwixStorage
from zimscraperlib.download import stream_file
from zimscraperlib.zim.creator import Creator
from zimscraperlib.zim.items import URLItem, StaticItem
from zimscraperlib.zim.items import StaticItem
from zimscraperlib.i18n import find_language_names
from zimscraperlib.inputs import handle_user_provided_file
from zimscraperlib.image.convertion import convert_image, create_favicon
Expand All @@ -32,6 +32,7 @@

from .constants import ROOT_DIR, getLogger, STUDIO_URL
from .database import KolibriDB
from .debug import URLItem

logger = getLogger()
options = [
Expand Down Expand Up @@ -397,6 +398,7 @@ def add_video_node(self, node_id):
content=html,
mimetype="text/html",
)
logger.debug(f"Added video #{node_id}")

def add_video_upon_completion(self, future):
"""adds the converted video inside this future to the zim
Expand Down Expand Up @@ -739,6 +741,7 @@ def run(self):
self.creator = Creator(
filename=self.output_dir.joinpath(self.fname),
main_path=self.root_id,
ignore_duplicates=True,
language="eng",
title=self.title,
description=self.description,
Expand Down Expand Up @@ -780,7 +783,10 @@ def run(self):
# only awaits future completion and doesn't include callbacks
self.videos_executor.shutdown()

succeeded = not result.not_done
succeeded = (
not result.not_done
and sum([1 if fs.exception() else 0 for fs in result.done]) == 0
)
except KeyboardInterrupt:
self.creator.can_finish = False
logger.error("KeyboardInterrupt, exiting.")
Expand All @@ -792,6 +798,15 @@ def run(self):
finally:
if succeeded:
logger.info("Finishing ZIM file…")
else:
# DEBUG: raise first exception
logger.info(
f"FAILURE not_done={len(result.not_done)} done={len(result.done)}"
)
if result.done:
for future in result.done:
if future.exception():
raise future.exception()
# we need to release libzim's resources.
# currently does nothing but crash if can_finish=False but that's awaiting
# impl. at libkiwix level
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
zimscraperlib>=1.8.0,<1.9
zimscraperlib>=2.0.0,<2.1
kiwixstorage>=0.8.3,<0.9
jinja2>=3.1.2,<3.2
pif==0.8.2
Expand Down

0 comments on commit f57adcd

Please sign in to comment.