Skip to content

Commit

Permalink
Trying to avoid issue #40
Browse files Browse the repository at this point in the history
- Duplicated and modified the URLProvider:
  - reading source until we reached specified size (/!\ risk of being stuck)
- clearly returning an empty Blob at the end (might have been the reason)
- Added new feature to URLItem to not use URLProvider for content under 2MiB
  • Loading branch information
rgaudin committed Feb 10, 2023
1 parent f57adcd commit 2c3fe80
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 3 deletions.
51 changes: 49 additions & 2 deletions kolibri2zim/debug.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
import re
import tempfile
import urllib.parse
from typing import Optional, Union

from zimscraperlib.download import stream_file
import libzim.writer
import requests
from zimscraperlib.download import stream_file, _get_retry_adapter
from zimscraperlib.zim.items import StaticItem
from zimscraperlib.zim.providers import URLProvider

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("DEBUG")
Expand All @@ -18,6 +20,46 @@
bs2 = 1226905
bss = [bs1, bs2]

# module-wide HTTP session so all downloads share connection pooling and retries
session = requests.Session()
# mounting on the "http" prefix also matches "https" URLs, so the retry
# adapter applies to both schemes
session.mount("http", _get_retry_adapter())


class URLProvider(libzim.writer.ContentProvider):
    """Provider downloading content as it is consumed by the libzim.

    Useful for non-indexed content for which feed() is called only once."""

    def __init__(
        self, url: str, size: Optional[int] = None, ref: Optional[object] = None
    ):
        """Open a streamed HTTP request for `url`.

        url: resource to download
        size: expected content size in bytes; probed from the server when None
        ref: opaque object kept alive for the provider's lifetime
        """
        super().__init__()
        self.url = url
        # prefer the caller-supplied size; otherwise ask the server for it
        # NOTE(review): get_size_of() may return None (no Content-Length);
        # get_size()/gen_blob() assume an int — confirm callers always know the size
        self.size = size if size is not None else self.get_size_of(url)
        self.ref = ref

        self.resp = session.get(url, stream=True)
        self.resp.raise_for_status()

    @staticmethod
    def get_size_of(url) -> Union[int, None]:
        """Size in bytes announced by the server for `url`, or None if unknown."""
        _, headers = stream_file(url, byte_stream=io.BytesIO(), only_first_block=True)
        try:
            return int(headers["Content-Length"])
        except Exception:
            return None

    def get_size(self) -> int:
        """Total content size reported to libzim; must match bytes yielded."""
        return self.size

    def gen_blob(self) -> libzim.writer.Blob:  # pragma: nocover
        """Yield content Blobs as downloaded, then a final empty Blob.

        Yields exactly self.size bytes: a short server response ends the
        stream early instead of raising, and an over-long final chunk is
        trimmed so we never exceed the size declared via get_size().
        """
        read = 0
        source = self.resp.iter_content(10 * 1024)
        while read < self.size:
            try:
                data = next(source)
            except StopIteration:
                # server sent fewer bytes than announced; stop cleanly rather
                # than letting StopIteration become a RuntimeError (PEP 479)
                break
            # trim so the total never exceeds the declared size
            remaining = self.size - read
            if len(data) > remaining:
                data = data[:remaining]
            read += len(data)
            yield libzim.writer.Blob(data)
        # an explicit empty Blob signals end-of-content to libzim
        yield libzim.writer.Blob(b"")


class URLItem(StaticItem):
"""StaticItem to automatically fetch and feed an URL resource
Expand Down Expand Up @@ -52,6 +94,7 @@ def __init__(self, url: str, **kwargs):
super().__init__(**kwargs)
self.url = urllib.parse.urlparse(url)
use_disk = getattr(self, "use_disk", False)
nostream_threshold = getattr(self, "nostream_threshold", -1)

logger.info(f"> {self.url.geturl()}")

Expand Down Expand Up @@ -100,6 +143,10 @@ def __init__(self, url: str, **kwargs):
else:
self.fileobj = target

if self.size and nostream_threshold and self.size <= nostream_threshold:
self.fileobj = io.BytesIO()
stream_file(self.url.geturl(), byte_stream=self.fileobj)

def get_path(self) -> str:
return getattr(self, "path", re.sub(r"^/", "", self.url.path))

Expand Down
5 changes: 4 additions & 1 deletion kolibri2zim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
"css",
"dedup_html_files",
]
NOSTREAM_FUNNEL_SIZE = 2**20 * 2 # 2MiB


def filename_for(file):
Expand Down Expand Up @@ -192,7 +193,9 @@ def funnel_file(self, fid, fext):
"""directly add a Kolibri file to the ZIM using same name"""
url, fname = get_kolibri_url_for(fid, fext)
with self.creator_lock:
self.creator.add_item(URLItem(url=url, path=fname))
self.creator.add_item(
URLItem(url=url, path=fname, nostream_threshold=NOSTREAM_FUNNEL_SIZE)
)
logger.debug(f"Added {fname} from Studio")

def download_to_disk(self, file_id, ext):
Expand Down

0 comments on commit 2c3fe80

Please sign in to comment.