Skip to content

Commit

Permalink
Testing URLItem with verbose debug
Browse files Browse the repository at this point in the history
- Investigating #40, using a copy of scraperlib's URLItem with verbose details
to identify which URL causes the issue

- not crashing on resource duplicates (duplicate content in different node IDs)
- fixed succeeded boolean that would have caused creating the ZIM even on exception

- [debug] raising first exception

- updated scraperlib to 2.0
  • Loading branch information
rgaudin committed Feb 9, 2023
1 parent 869e11d commit f57adcd
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 4 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Changed

- Using zimscraperlib 1.8, creating libzim7 ZIM files
- Using zimscraperlib 2.0, creating libzim8 ZIM files
- Allowing duplicated resources at different paths
- Fixed succeeded status in case an exception was raised
- Updated dependencies (beautifulsoup4, Jinja2)

## [1.0.0] - 2021-11-11
Expand Down
124 changes: 124 additions & 0 deletions kolibri2zim/debug.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import io
import logging
import pathlib
import re
import tempfile
import urllib.parse

from zimscraperlib.download import stream_file
from zimscraperlib.zim.items import StaticItem
from zimscraperlib.zim.providers import URLProvider

# verbose logging so every URL handled by URLItem is traced (investigating #40)
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("DEBUG")
# the two byte counts from libzim's mismatch error we're hunting:
# size[464505] == provider->getSize()[1226905]
# 453.62 KiB 1.17 MiB

bs1, bs2 = 464505, 1226905
bss = [bs1, bs2]


class URLItem(StaticItem):
    """StaticItem to automatically fetch and feed a URL resource

    Verbose-debug copy of scraperlib's URLItem: logs every URL it handles and
    flags any resource whose size matches one of the suspect sizes in `bss`.

    Appropriate for retrieving/bundling static assets that you don't need to
    post-process.
    Uses URL's path as zim path if none provided
    Keeps single in-memory copy of content for HTML resources (indexed)
    Works transparently on servers returning a Content-Length header (most)
    *Swaps* a copy of the content either in memory or on disk (`use_disk=True`)
    in case the content size could not be retrieved from headers.
    Use `tmp_dir` to point location of that temp file."""

    @staticmethod
    def download_for_size(url, on_disk, tmp_dir=None):
        """Download URL fully and return (target, size)

        url: urllib.parse.ParseResult of the resource to download
        on_disk: store content in a temp file instead of an in-memory stream
        tmp_dir: directory to create the temp file in (only used with on_disk)

        Returns a (pathlib.Path or io.BytesIO, size-in-bytes) tuple."""
        fpath = stream = None
        if on_disk:
            # keep the URL path's extension so mimetype detection still works
            suffix = pathlib.Path(re.sub(r"^/", "", url.path)).suffix
            fpath = pathlib.Path(
                tempfile.NamedTemporaryFile(
                    suffix=suffix, delete=False, dir=tmp_dir
                ).name
            )
        else:
            stream = io.BytesIO()
        size, _ = stream_file(url.geturl(), fpath=fpath, byte_stream=stream)
        return fpath or stream, size

    def __init__(self, url: str, **kwargs):
        super().__init__(**kwargs)
        self.url = urllib.parse.urlparse(url)
        use_disk = getattr(self, "use_disk", False)

        logger.info(f"> {self.url.geturl()}")

        # fetch headers to retrieve size and type
        try:
            _, self.headers = stream_file(
                url, byte_stream=io.BytesIO(), only_first_block=True
            )
        except Exception as exc:
            # chain the original exception so the root cause stays visible
            raise IOError(f"Unable to access URL at {url}: {exc}") from exc

        # HTML content will be indexed.
        # we proxy the content in the Item to prevent double-download of the resource
        # we use a value-variable to prevent race-conditions in the multiple
        # reads of the content in the provider
        if self.should_index:
            self.fileobj = io.BytesIO()
            self.size, _ = stream_file(self.url.geturl(), byte_stream=self.fileobj)
            logger.info(f"> {self.url.geturl()} SHOULD_INDEX {self.size=}")
            if self.size in bss:
                # fixed: was missing the f prefix, logging the literal template
                logger.error(f"FOUND {self.url.geturl()} HAS {self.size}")
            return

        try:
            # Encoded data (compressed) prevents us from using Content-Length header
            # as source for the content (it represents length of compressed data)
            if self.headers.get("Content-Encoding", "identity") != "identity":
                raise ValueError("Can't trust Content-Length for size")
            # non-html, non-compressed data.
            self.size = int(self.headers["Content-Length"])
            logger.info(f"> {self.url.geturl()} Content-Length {self.size=}")
            if self.size in bss:
                # fixed: was missing the f prefix, logging the literal template
                logger.error(f"FOUND {self.url.geturl()} HAS {self.size}")
        except Exception:
            # we couldn't retrieve size so we have to download resource to
            target, self.size = self.download_for_size(
                self.url, on_disk=use_disk, tmp_dir=getattr(self, "tmp_dir", None)
            )
            logger.info(f"> {self.url.geturl()} Downloaded {self.size=}")
            if self.size in bss:
                # fixed: was missing the f prefix, logging the literal template
                logger.error(f"FOUND {self.url.geturl()} HAS {self.size}")
            # downloaded to disk and using a file path from now on
            if use_disk:
                self.filepath = target
            # downloaded to RAM and using a bytes object
            else:
                self.fileobj = target

    def get_path(self) -> str:
        """Zim path: explicit `path` attribute or the URL path sans leading /"""
        return getattr(self, "path", re.sub(r"^/", "", self.url.path))

    def get_title(self) -> str:
        """Item title: explicit `title` attribute or empty string"""
        return getattr(self, "title", "")

    def get_mimetype(self) -> str:
        """Mimetype: explicit attribute, server's Content-Type, or binary default"""
        return getattr(
            self,
            "mimetype",
            self.headers.get("Content-Type", "application/octet-stream"),
        )

    def get_contentprovider(self):
        """Content provider: parent's (file/bytes based) or URLProvider fallback"""
        try:
            return super().get_contentprovider()
        except NotImplementedError:
            if not getattr(self, "size", None):
                # fixed: was missing the f prefix, logging the literal template
                logger.info(f"> {self.url.geturl()} HAS NO SIZE FOR CP")
            return URLProvider(
                url=self.url.geturl(), size=getattr(self, "size", None), ref=self
            )
19 changes: 17 additions & 2 deletions kolibri2zim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from kiwixstorage import KiwixStorage
from zimscraperlib.download import stream_file
from zimscraperlib.zim.creator import Creator
from zimscraperlib.zim.items import URLItem, StaticItem
from zimscraperlib.zim.items import StaticItem
from zimscraperlib.i18n import find_language_names
from zimscraperlib.inputs import handle_user_provided_file
from zimscraperlib.image.convertion import convert_image, create_favicon
Expand All @@ -32,6 +32,7 @@

from .constants import ROOT_DIR, getLogger, STUDIO_URL
from .database import KolibriDB
from .debug import URLItem

logger = getLogger()
options = [
Expand Down Expand Up @@ -397,6 +398,7 @@ def add_video_node(self, node_id):
content=html,
mimetype="text/html",
)
logger.debug(f"Added video #{node_id}")

def add_video_upon_completion(self, future):
"""adds the converted video inside this future to the zim
Expand Down Expand Up @@ -739,6 +741,7 @@ def run(self):
self.creator = Creator(
filename=self.output_dir.joinpath(self.fname),
main_path=self.root_id,
ignore_duplicates=True,
language="eng",
title=self.title,
description=self.description,
Expand Down Expand Up @@ -780,7 +783,10 @@ def run(self):
# only awaits future completion and doesn't include callbacks
self.videos_executor.shutdown()

succeeded = not result.not_done
succeeded = (
not result.not_done
and sum([1 if fs.exception() else 0 for fs in result.done]) == 0
)
except KeyboardInterrupt:
self.creator.can_finish = False
logger.error("KeyboardInterrupt, exiting.")
Expand All @@ -792,6 +798,15 @@ def run(self):
finally:
if succeeded:
logger.info("Finishing ZIM file…")
else:
# DEBUG: raise first exception
logger.info(
f"FAILURE not_done={len(result.not_done)} done={len(result.done)}"
)
if result.done:
for future in result.done:
if future.exception():
raise future.exception()
# we need to release libzim's resources.
# currently does nothing but crash if can_finish=False but that's awaiting
# impl. at libkiwix level
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
zimscraperlib>=1.8.0,<1.9
zimscraperlib>=2.0.0,<2.1
kiwixstorage>=0.8.3,<0.9
jinja2>=3.1.2,<3.2
pif==0.8.2
Expand Down

0 comments on commit f57adcd

Please sign in to comment.