From 1690f210150c36d54e00727a506d95f697bf6fa5 Mon Sep 17 00:00:00 2001 From: Daniel Holth Date: Mon, 3 Oct 2022 09:31:23 -0400 Subject: [PATCH 1/8] lazier lazy_wheel --- src/pip/_internal/network/lazy_wheel.py | 88 +++++++++++++++++-------- 1 file changed, 61 insertions(+), 27 deletions(-) diff --git a/src/pip/_internal/network/lazy_wheel.py b/src/pip/_internal/network/lazy_wheel.py index 854a6fa1fdc..67f7cea0a1c 100644 --- a/src/pip/_internal/network/lazy_wheel.py +++ b/src/pip/_internal/network/lazy_wheel.py @@ -1,26 +1,31 @@ """Lazy ZIP over HTTP""" +from __future__ import annotations + __all__ = ["HTTPRangeRequestUnsupported", "dist_from_wheel_url"] +import logging from bisect import bisect_left, bisect_right from contextlib import contextmanager from tempfile import NamedTemporaryFile -from typing import Any, Dict, Generator, List, Optional, Tuple +from typing import Any, Iterator from zipfile import BadZipfile, ZipFile from pip._vendor.packaging.utils import canonicalize_name from pip._vendor.requests.models import CONTENT_CHUNK_SIZE, Response from pip._internal.metadata import BaseDistribution, MemoryWheel, get_wheel_distribution -from pip._internal.network.session import PipSession -from pip._internal.network.utils import HEADERS, raise_for_status, response_chunks +from pip._internal.network.session import PipSession as Session +from pip._internal.network.utils import HEADERS + +log = logging.getLogger(__name__) class HTTPRangeRequestUnsupported(Exception): pass -def dist_from_wheel_url(name: str, url: str, session: PipSession) -> BaseDistribution: +def dist_from_wheel_url(name: str, url: str, session: Session) -> BaseDistribution: """Return a distribution object from the given wheel URL. This uses HTTP range requests to only fetch the portion of the wheel @@ -47,20 +52,41 @@ class LazyZipOverHTTP: """ def __init__( - self, url: str, session: PipSession, chunk_size: int = CONTENT_CHUNK_SIZE + self, url: str, session: Session, chunk_size: int = CONTENT_CHUNK_SIZE ) -> None: - head = session.head(url, headers=HEADERS) - raise_for_status(head) - assert head.status_code == 200 + + # if CONTENT_CHUNK_SIZE is bigger than the file: + # In [8]: response.headers["Content-Range"] + # Out[8]: 'bytes 0-3133374/3133375' + + self._request_count = 0 + self._session, self._url, self._chunk_size = session, url, chunk_size - self._length = int(head.headers["Content-Length"]) + + # initial range request for the end of the file + tail = self._stream_response(start="", end=CONTENT_CHUNK_SIZE) + # e.g. {'accept-ranges': 'bytes', 'content-length': '10240', + # 'content-range': 'bytes 12824-23063/23064', 'last-modified': 'Sat, 16 + # Apr 2022 13:03:02 GMT', 'date': 'Thu, 21 Apr 2022 11:34:04 GMT'} + + if tail.status_code != 206: + raise HTTPRangeRequestUnsupported("range request is not supported") + + # lowercase content-range to support s3 + self._length = int(tail.headers["content-range"].partition("/")[-1]) self._file = NamedTemporaryFile() self.truncate(self._length) - self._left: List[int] = [] - self._right: List[int] = [] - if "bytes" not in head.headers.get("Accept-Ranges", "none"): - raise HTTPRangeRequestUnsupported("range request is not supported") - self._check_zip() + + # length is also in Content-Length and Content-Range header + with self._stay(): + content_length = int(tail.headers["content-length"]) + if hasattr(tail, "content"): + assert content_length == len(tail.content) + self.seek(self._length - content_length) + for chunk in tail.iter_content(self._chunk_size): + self._file.write(chunk) + self._left: list[int] = [self._length - content_length] + self._right: list[int] = [self._length - 1] @property def mode(self) -> str: @@ -92,7 +118,8 @@ def read(self, size: int = -1) -> bytes: all bytes until EOF are returned. Fewer than size bytes may be returned if EOF is reached. """ - download_size = max(size, self._chunk_size) + # BUG does not download correctly if size is unspecified + download_size = size start, length = self.tell(), self._length stop = length if size < 0 else min(start + download_size, length) start = max(0, stop - download_size) @@ -117,7 +144,7 @@ def tell(self) -> int: """Return the current position.""" return self._file.tell() - def truncate(self, size: Optional[int] = None) -> int: + def truncate(self, size: int | None = None) -> int: """Resize the stream to the given size in bytes. If size is unspecified resize to the current position. @@ -131,15 +158,15 @@ def writable(self) -> bool: """Return False.""" return False - def __enter__(self) -> "LazyZipOverHTTP": + def __enter__(self) -> LazyZipOverHTTP: self._file.__enter__() return self - def __exit__(self, *exc: Any) -> None: - self._file.__exit__(*exc) + def __exit__(self, *exc: Any) -> bool | None: + return self._file.__exit__(*exc) @contextmanager - def _stay(self) -> Generator[None, None, None]: + def _stay(self) -> Iterator[None]: """Return a context manager keeping the position. At the end of the block, seek back to original position. @@ -166,19 +193,27 @@ def _check_zip(self) -> None: break def _stream_response( - self, start: int, end: int, base_headers: Dict[str, str] = HEADERS + self, start: int | str, end: int, base_headers: dict[str, str] = HEADERS ) -> Response: - """Return HTTP response to a range request from start to end.""" + """Return HTTP response to a range request from start to end. + + :param start: if "", request ``end` bytes from end of file.""" headers = base_headers.copy() headers["Range"] = f"bytes={start}-{end}" + log.debug("%s", headers["Range"]) # TODO: Get range requests to be correctly cached headers["Cache-Control"] = "no-cache" - return self._session.get(self._url, headers=headers, stream=True) + # TODO: If-Match (etag) to detect file changed during fetch would be a + # good addition to HEADERS + self._request_count += 1 + response = self._session.get(self._url, headers=headers, stream=True) + response.raise_for_status() + return response def _merge( self, start: int, end: int, left: int, right: int - ) -> Generator[Tuple[int, int], None, None]: - """Return a generator of intervals to be fetched. + ) -> Iterator[tuple[int, int]]: + """Return an iterator of intervals to be fetched. Args: start (int): Start of needed interval @@ -204,7 +239,6 @@ def _download(self, start: int, end: int) -> None: right = bisect_right(self._left, end) for start, end in self._merge(start, end, left, right): response = self._stream_response(start, end) - response.raise_for_status() self.seek(start) - for chunk in response_chunks(response, self._chunk_size): + for chunk in response.iter_content(self._chunk_size): self._file.write(chunk) From 187fcd9a290759401073342ea9ba32cb761af149 Mon Sep 17 00:00:00 2001 From: Daniel Holth Date: Mon, 3 Oct 2022 09:52:04 -0400 Subject: [PATCH 2/8] add news file --- news/11481.feature.rst | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 news/11481.feature.rst diff --git a/news/11481.feature.rst b/news/11481.feature.rst new file mode 100644 index 00000000000..0b38d1554c0 --- /dev/null +++ b/news/11481.feature.rst @@ -0,0 +1,2 @@ +Implement lazier lazy_wheel that avoids the HEAD request, fetching the metadata +for many wheels in 1 request. \ No newline at end of file From 974f4332a4924fc8fb48b2672f056de6b4510424 Mon Sep 17 00:00:00 2001 From: Daniel Holth Date: Mon, 3 Oct 2022 09:53:49 -0400 Subject: [PATCH 3/8] type: ignore --- src/pip/_internal/network/lazy_wheel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pip/_internal/network/lazy_wheel.py b/src/pip/_internal/network/lazy_wheel.py index 67f7cea0a1c..d63896d13af 100644 --- a/src/pip/_internal/network/lazy_wheel.py +++ b/src/pip/_internal/network/lazy_wheel.py @@ -163,7 +163,7 @@ def __enter__(self) -> LazyZipOverHTTP: return self def __exit__(self, *exc: Any) -> bool | None: - return self._file.__exit__(*exc) + return self._file.__exit__(*exc) # type: ignore @contextmanager def _stay(self) -> Iterator[None]: From 934fdc494de84d1ffe1db5cb0ccbf1fa9e222bea Mon Sep 17 00:00:00 2001 From: Daniel Holth Date: Mon, 3 Oct 2022 09:56:31 -0400 Subject: [PATCH 4/8] newline --- news/11481.feature.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/news/11481.feature.rst b/news/11481.feature.rst index 0b38d1554c0..a4c298505a5 100644 --- a/news/11481.feature.rst +++ b/news/11481.feature.rst @@ -1,2 +1,2 @@ Implement lazier lazy_wheel that avoids the HEAD request, fetching the metadata -for many wheels in 1 request. \ No newline at end of file +for many wheels in 1 request. From 5929c8ba78b46bfd0f82b3d06f9760a07deb6c64 Mon Sep 17 00:00:00 2001 From: Daniel Holth Date: Tue, 4 Oct 2022 11:21:54 -0400 Subject: [PATCH 5/8] handle short files `416`; prefetch entire dist-info --- src/pip/_internal/network/lazy_wheel.py | 79 ++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 3 deletions(-) diff --git a/src/pip/_internal/network/lazy_wheel.py b/src/pip/_internal/network/lazy_wheel.py index d63896d13af..5bb21e5730c 100644 --- a/src/pip/_internal/network/lazy_wheel.py +++ b/src/pip/_internal/network/lazy_wheel.py @@ -12,7 +12,7 @@ from zipfile import BadZipfile, ZipFile from pip._vendor.packaging.utils import canonicalize_name -from pip._vendor.requests.models import CONTENT_CHUNK_SIZE, Response +from pip._vendor.requests.models import CONTENT_CHUNK_SIZE, Response, HTTPError from pip._internal.metadata import BaseDistribution, MemoryWheel, get_wheel_distribution from pip._internal.network.session import PipSession as Session @@ -34,6 +34,8 @@ def dist_from_wheel_url(name: str, url: str, session: Session) -> BaseDistributi is raised. """ with LazyZipOverHTTP(url, session) as zf: + zf.prefetch_dist_info() + # For read-only ZIP files, ZipFile only needs methods read, # seek, seekable and tell, not the whole IO protocol. wheel = MemoryWheel(zf.name, zf) # type: ignore @@ -64,13 +66,30 @@ def __init__( self._session, self._url, self._chunk_size = session, url, chunk_size # initial range request for the end of the file - tail = self._stream_response(start="", end=CONTENT_CHUNK_SIZE) + try: + tail = self._stream_response(start="", end=CONTENT_CHUNK_SIZE) + except HTTPError as e: + if e.response.status_code != 416: + raise + + # The 416 response message contains a Content-Range indicating an + # unsatisfied range (that is a '*') followed by a '/' and the current + # length of the resource. E.g. Content-Range: bytes */12777 + content_length = int(e.response.headers['content-range'].rsplit('/', 1)[-1]) + tail = self._stream_response(start=0, end=content_length) + # e.g. {'accept-ranges': 'bytes', 'content-length': '10240', # 'content-range': 'bytes 12824-23063/23064', 'last-modified': 'Sat, 16 # Apr 2022 13:03:02 GMT', 'date': 'Thu, 21 Apr 2022 11:34:04 GMT'} + if tail.status_code != 206: - raise HTTPRangeRequestUnsupported("range request is not supported") + if tail.status_code == 200 and int(tail.headers['content-length']) <= CONTENT_CHUNK_SIZE: + # small file + content_length = len(tail.content) + tail.headers["content-range"] = f"0-{content_length-1}/{content_length}" + else: + raise HTTPRangeRequestUnsupported("range request is not supported") # lowercase content-range to support s3 self._length = int(tail.headers["content-range"].partition("/")[-1]) @@ -163,6 +182,7 @@ def __enter__(self) -> LazyZipOverHTTP: return self def __exit__(self, *exc: Any) -> bool | None: + print(self._request_count, 'requests to fetch metadata from', self._url[107:]) return self._file.__exit__(*exc) # type: ignore @contextmanager @@ -242,3 +262,56 @@ def _download(self, start: int, end: int) -> None: self.seek(start) for chunk in response.iter_content(self._chunk_size): self._file.write(chunk) + + def prefetch(self, target_file): + """ + Prefetch a specific file from the remote ZIP in one request. + """ + with self._stay(): # not strictly necessary + # try to read entire conda info in one request + zf = ZipFile(self) + infolist = zf.infolist() + for i, info in enumerate(infolist): + if info.filename == target_file: + # could be incorrect if zipfile was concatenated to another + # file (not likely for .conda) + start = info.header_offset + try: + end = infolist[i + 1].header_offset + # or info.header_offset + # + len(info.filename) + # + len(info.extra) + # + info.compress_size + # (unless Zip64) + except IndexError: + end = zf.start_dir + self.seek(start) + self.read(end - start) + log.debug( + "prefetch %s-%s", + info.header_offset, + end, + ) + break + else: + log.debug("no zip prefetch") + + def prefetch_dist_info(self): + """ + Read contents of entire dist-info section of wheel. + + pip wants to read WHEEL and METADATA. + """ + print("prefetch dist-info begin") + with self._stay(): + zf = ZipFile(self) + infolist = zf.infolist() + for i, info in enumerate(infolist): + # should be (wheel filename without extension etc) + (.dist-info/) + if ".dist-info/" in info.filename: + start = info.header_offset + end = zf.start_dir + self.seek(start) + print(f"prefetch dist-info {start}-{end}") + self.read(end-start) + break \ No newline at end of file From f30b9c101a4be23220a876d1af6e5379a786a7d8 Mon Sep 17 00:00:00 2001 From: Daniel Holth Date: Tue, 4 Oct 2022 11:22:11 -0400 Subject: [PATCH 6/8] format --- src/pip/_internal/network/lazy_wheel.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/pip/_internal/network/lazy_wheel.py b/src/pip/_internal/network/lazy_wheel.py index 5bb21e5730c..d3c308ba83c 100644 --- a/src/pip/_internal/network/lazy_wheel.py +++ b/src/pip/_internal/network/lazy_wheel.py @@ -75,16 +75,18 @@ def __init__( # The 416 response message contains a Content-Range indicating an # unsatisfied range (that is a '*') followed by a '/' and the current # length of the resource. E.g. Content-Range: bytes */12777 - content_length = int(e.response.headers['content-range'].rsplit('/', 1)[-1]) + content_length = int(e.response.headers["content-range"].rsplit("/", 1)[-1]) tail = self._stream_response(start=0, end=content_length) # e.g. {'accept-ranges': 'bytes', 'content-length': '10240', # 'content-range': 'bytes 12824-23063/23064', 'last-modified': 'Sat, 16 # Apr 2022 13:03:02 GMT', 'date': 'Thu, 21 Apr 2022 11:34:04 GMT'} - if tail.status_code != 206: - if tail.status_code == 200 and int(tail.headers['content-length']) <= CONTENT_CHUNK_SIZE: + if ( + tail.status_code == 200 + and int(tail.headers["content-length"]) <= CONTENT_CHUNK_SIZE + ): # small file content_length = len(tail.content) tail.headers["content-range"] = f"0-{content_length-1}/{content_length}" @@ -182,7 +184,7 @@ def __enter__(self) -> LazyZipOverHTTP: return self def __exit__(self, *exc: Any) -> bool | None: - print(self._request_count, 'requests to fetch metadata from', self._url[107:]) + print(self._request_count, "requests to fetch metadata from", self._url[107:]) return self._file.__exit__(*exc) # type: ignore @contextmanager @@ -313,5 +315,5 @@ def prefetch_dist_info(self): end = zf.start_dir self.seek(start) print(f"prefetch dist-info {start}-{end}") - self.read(end-start) - break \ No newline at end of file + self.read(end - start) + break From 1e58459ae49c4225c38fcd938095351ac60094c0 Mon Sep 17 00:00:00 2001 From: Daniel Holth Date: Wed, 19 Oct 2022 13:35:30 -0400 Subject: [PATCH 7/8] lazy_wheel: update method types --- src/pip/_internal/network/lazy_wheel.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/pip/_internal/network/lazy_wheel.py b/src/pip/_internal/network/lazy_wheel.py index d3c308ba83c..732cdb630d6 100644 --- a/src/pip/_internal/network/lazy_wheel.py +++ b/src/pip/_internal/network/lazy_wheel.py @@ -12,7 +12,7 @@ from zipfile import BadZipfile, ZipFile from pip._vendor.packaging.utils import canonicalize_name -from pip._vendor.requests.models import CONTENT_CHUNK_SIZE, Response, HTTPError +from pip._vendor.requests.models import CONTENT_CHUNK_SIZE, HTTPError, Response from pip._internal.metadata import BaseDistribution, MemoryWheel, get_wheel_distribution from pip._internal.network.session import PipSession as Session @@ -265,13 +265,13 @@ def _download(self, start: int, end: int) -> None: for chunk in response.iter_content(self._chunk_size): self._file.write(chunk) - def prefetch(self, target_file): + def prefetch(self, target_file: str) -> None: """ Prefetch a specific file from the remote ZIP in one request. """ with self._stay(): # not strictly necessary # try to read entire conda info in one request - zf = ZipFile(self) + zf = ZipFile(self) # type: ignore infolist = zf.infolist() for i, info in enumerate(infolist): if info.filename == target_file: @@ -298,22 +298,20 @@ def prefetch(self, target_file): else: log.debug("no zip prefetch") - def prefetch_dist_info(self): + def prefetch_dist_info(self) -> None: """ Read contents of entire dist-info section of wheel. pip wants to read WHEEL and METADATA. """ - print("prefetch dist-info begin") with self._stay(): - zf = ZipFile(self) + zf = ZipFile(self) # type: ignore infolist = zf.infolist() - for i, info in enumerate(infolist): + for info in infolist: # should be (wheel filename without extension etc) + (.dist-info/) if ".dist-info/" in info.filename: start = info.header_offset end = zf.start_dir self.seek(start) - print(f"prefetch dist-info {start}-{end}") self.read(end - start) break From 525814bd3aaaf7399ce00a64e384a441892fb61a Mon Sep 17 00:00:00 2001 From: Daniel Holth Date: Wed, 19 Oct 2022 14:31:33 -0400 Subject: [PATCH 8/8] lazy_wheel: translate BadZipfile to InvalidWheel --- src/pip/_internal/network/lazy_wheel.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/pip/_internal/network/lazy_wheel.py b/src/pip/_internal/network/lazy_wheel.py index 732cdb630d6..f3a9d9bba26 100644 --- a/src/pip/_internal/network/lazy_wheel.py +++ b/src/pip/_internal/network/lazy_wheel.py @@ -14,6 +14,7 @@ from pip._vendor.packaging.utils import canonicalize_name from pip._vendor.requests.models import CONTENT_CHUNK_SIZE, HTTPError, Response +from pip._internal.exceptions import InvalidWheel from pip._internal.metadata import BaseDistribution, MemoryWheel, get_wheel_distribution from pip._internal.network.session import PipSession as Session from pip._internal.network.utils import HEADERS @@ -33,15 +34,18 @@ def dist_from_wheel_url(name: str, url: str, session: Session) -> BaseDistributi If such requests are not supported, HTTPRangeRequestUnsupported is raised. """ - with LazyZipOverHTTP(url, session) as zf: - zf.prefetch_dist_info() - - # For read-only ZIP files, ZipFile only needs methods read, - # seek, seekable and tell, not the whole IO protocol. - wheel = MemoryWheel(zf.name, zf) # type: ignore - # After context manager exit, wheel.name - # is an invalid file by intention. - return get_wheel_distribution(wheel, canonicalize_name(name)) + try: + with LazyZipOverHTTP(url, session) as zf: + zf.prefetch_dist_info() + + # For read-only ZIP files, ZipFile only needs methods read, + # seek, seekable and tell, not the whole IO protocol. + wheel = MemoryWheel(zf.name, zf) # type: ignore + # After context manager exit, wheel.name + # is an invalid file by intention. + return get_wheel_distribution(wheel, canonicalize_name(name)) + except BadZipfile: + raise InvalidWheel(url, name) class LazyZipOverHTTP: