-
-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Investigating #40, using a copy of scraperlib's URLItem with verbose details to identify which URL causes the issue - not crashing on resource duplicates (duplicate content in different node IDs) - fixed succeeded boolean that would cause the ZIM to be created even on exception - [debug] raising first exception - updated scraperlib to 2.0
- Loading branch information
Showing
4 changed files
with
145 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
import io | ||
import logging | ||
import pathlib | ||
import re | ||
import tempfile | ||
import urllib.parse | ||
|
||
from zimscraperlib.download import stream_file | ||
from zimscraperlib.zim.items import StaticItem | ||
from zimscraperlib.zim.providers import URLProvider | ||
|
||
# Debug-only logging setup: everything from DEBUG level up is emitted through
# a dedicated logger named "DEBUG".
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("DEBUG")

# Byte counts quoted by the size-mismatch message under investigation:
#   size[464505] == provider->getSize()[1226905]
#   (453.62 KiB)                        (1.17 MiB)
# Any item whose size equals one of these values is flagged in the logs.
bs1, bs2 = 464505, 1226905
bss = [bs1, bs2]
|
||
|
||
class URLItem(StaticItem):
    """StaticItem to automatically fetch and feed an URL resource

    Debug copy with verbose logging: every size that gets recorded is logged,
    and an error is logged when that size matches one of the values in `bss`.

    Appropriate for retrieving/bundling static assets that you don't need to
    post-process.

    Uses URL's path as zim path if none provided
    Keeps single in-memory copy of content for HTML resources (indexed)
    Works transparently on servers returning a Content-Length header (most)
    *Swaps* a copy of the content either in memory or on disk (`use_disk=True`)
    in case the content size could not be retrieved from headers.
    Use `tmp_dir` to point location of that temp file."""

    @staticmethod
    def download_for_size(url, on_disk, tmp_dir=None):
        """Download URL to a temp file and return its tempfile and size

        Parameters:
            url: parsed URL (result of `urllib.parse.urlparse`)
            on_disk: download to a temporary file when truthy, to RAM otherwise
            tmp_dir: folder to create the temporary file in (on_disk only)

        Returns a `(target, size)` tuple where `target` is the `pathlib.Path`
        of the temporary file (on_disk) or the `io.BytesIO` that received
        the content."""
        fpath = stream = None
        if on_disk:
            suffix = pathlib.Path(re.sub(r"^/", "", url.path)).suffix
            # only the name is needed: close the handle right away instead of
            # leaking it. delete=False keeps the file in place for the download
            with tempfile.NamedTemporaryFile(
                suffix=suffix, delete=False, dir=tmp_dir
            ) as tmp_file:
                fpath = pathlib.Path(tmp_file.name)
        else:
            stream = io.BytesIO()
        size, _ = stream_file(url.geturl(), fpath=fpath, byte_stream=stream)
        return fpath or stream, size

    def __init__(self, url: str, **kwargs):
        """Record where the content of `url` comes from and how large it is

        Parameters:
            url: address of the resource to bundle
            **kwargs: forwarded as-is to `StaticItem`

        Raises IOError if the URL cannot be accessed."""
        super().__init__(**kwargs)
        self.url = urllib.parse.urlparse(url)
        use_disk = getattr(self, "use_disk", False)

        logger.info(f"> {self.url.geturl()}")

        # fetch headers to retrieve size and type
        try:
            _, self.headers = stream_file(
                url, byte_stream=io.BytesIO(), only_first_block=True
            )
        except Exception as exc:
            # chain the original error so the root cause stays in the traceback
            raise IOError(f"Unable to access URL at {url}: {exc}") from exc

        # HTML content will be indexed.
        # we proxy the content in the Item to prevent double-download of the resource
        # we use a value-variable to prevent race-conditions in the multiple
        # reads of the content in the provider
        if self.should_index:
            self.fileobj = io.BytesIO()
            self.size, _ = stream_file(self.url.geturl(), byte_stream=self.fileobj)
            self._trace_size("SHOULD_INDEX")
            return

        try:
            # Encoded data (compressed) prevents us from using Content-Length header
            # as source for the content (it represents length of compressed data)
            if self.headers.get("Content-Encoding", "identity") != "identity":
                raise ValueError("Can't trust Content-Length for size")
            # non-html, non-compressed data.
            self.size = int(self.headers["Content-Length"])
            self._trace_size("Content-Length")
        except Exception:
            # deliberately broad: whatever prevented us from getting a size
            # from the headers, we fall back to downloading the resource
            target, self.size = self.download_for_size(
                self.url, on_disk=use_disk, tmp_dir=getattr(self, "tmp_dir", None)
            )
            self._trace_size("Downloaded")
            # downloaded to disk and using a file path from now on
            if use_disk:
                self.filepath = target
            # downloaded to RAM and using a bytes object
            else:
                self.fileobj = target

    def _trace_size(self, origin: str) -> None:
        """Log the recorded size (tagged with `origin`) and flag suspect ones"""
        logger.info(f"> {self.url.geturl()} {origin} {self.size=}")
        if self.size in bss:
            logger.error(f"FOUND {self.url.geturl()} HAS {self.size}")

    def get_path(self) -> str:
        """`path` attribute if set, URL's path without its leading slash otherwise"""
        return getattr(self, "path", re.sub(r"^/", "", self.url.path))

    def get_title(self) -> str:
        """`title` attribute if set, empty string otherwise"""
        return getattr(self, "title", "")

    def get_mimetype(self) -> str:
        """`mimetype` attribute if set, Content-Type header otherwise

        Defaults to `application/octet-stream` when the header is missing."""
        return getattr(
            self,
            "mimetype",
            self.headers.get("Content-Type", "application/octet-stream"),
        )

    def get_contentprovider(self):
        """Parent's content provider, or an URLProvider if it has none to offer"""
        try:
            return super().get_contentprovider()
        except NotImplementedError:
            if not getattr(self, "size", None):
                logger.info(f"> {self.url.geturl()} HAS NO SIZE FOR CP")
            return URLProvider(
                url=self.url.geturl(), size=getattr(self, "size", None), ref=self
            )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
zimscraperlib>=1.8.0,<1.9 | ||
zimscraperlib>=2.0.0,<2.1 | ||
kiwixstorage>=0.8.3,<0.9 | ||
jinja2>=3.1.2,<3.2 | ||
pif==0.8.2 | ||
|