-
Notifications
You must be signed in to change notification settings - Fork 48
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: safely resume interrupted downloads #294
Changes from all commits
3189ac4
4a37b82
2b14f44
b1895f9
c04d8ba
fee2696
1e81eb7
5dea6ab
8b09434
c29efb2
4967a4e
c4591d8
e28d237
d579918
9a04788
188b484
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -86,12 +86,22 @@ def _write_to_stream(self, response): | |
checksum doesn't agree with server-computed checksum. | ||
""" | ||
|
||
# `_get_expected_checksum()` may return None even if a checksum was | ||
# requested, in which case it will emit an info log _MISSING_CHECKSUM. | ||
# If an invalid checksum type is specified, this will raise ValueError. | ||
expected_checksum, checksum_object = _helpers._get_expected_checksum( | ||
response, self._get_headers, self.media_url, checksum_type=self.checksum | ||
) | ||
# Retrieve the expected checksum only once for the download request, | ||
# then compute and validate the checksum when the full download completes. | ||
# Retried requests are range requests, and there's no way to detect | ||
# data corruption for that byte range alone. | ||
if self._expected_checksum is None and self._checksum_object is None: | ||
# `_get_expected_checksum()` may return None even if a checksum was | ||
# requested, in which case it will emit an info log _MISSING_CHECKSUM. | ||
# If an invalid checksum type is specified, this will raise ValueError. | ||
expected_checksum, checksum_object = _helpers._get_expected_checksum( | ||
response, self._get_headers, self.media_url, checksum_type=self.checksum | ||
) | ||
self._expected_checksum = expected_checksum | ||
self._checksum_object = checksum_object | ||
else: | ||
expected_checksum = self._expected_checksum | ||
checksum_object = self._checksum_object | ||
|
||
with response: | ||
# NOTE: In order to handle compressed streams gracefully, we try | ||
|
@@ -104,6 +114,7 @@ def _write_to_stream(self, response): | |
) | ||
for chunk in body_iter: | ||
self._stream.write(chunk) | ||
self._bytes_downloaded += len(chunk) | ||
local_checksum_object.update(chunk) | ||
|
||
if expected_checksum is not None: | ||
|
@@ -150,7 +161,7 @@ def consume( | |
ValueError: If the current :class:`Download` has already | ||
finished. | ||
""" | ||
method, url, payload, headers = self._prepare_request() | ||
method, _, payload, headers = self._prepare_request() | ||
# NOTE: We assume "payload is None" but pass it along anyway. | ||
request_kwargs = { | ||
"data": payload, | ||
|
@@ -160,10 +171,39 @@ def consume( | |
if self._stream is not None: | ||
request_kwargs["stream"] = True | ||
|
||
# Assign object generation if generation is specified in the media url. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would this happen via a user specifying a generation on the object? Were we not respecting this previously? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep this would happen via a user specifying a generation on the object. Previously, we've been respecting that only through A property P.S. It's tricky in how limited information is passed from python-storage to resumable-media-python. A resumable-media-python |
||
if self._object_generation is None: | ||
self._object_generation = _helpers._get_generation_from_url(self.media_url) | ||
|
||
# Wrap the request business logic in a function to be retried. | ||
def retriable_request(): | ||
url = self.media_url | ||
|
||
# To restart an interrupted download, read from the offset of last byte | ||
# received using a range request, and set object generation query param. | ||
if self._bytes_downloaded > 0: | ||
_download.add_bytes_range( | ||
self._bytes_downloaded, self.end, self._headers | ||
) | ||
request_kwargs["headers"] = self._headers | ||
|
||
# Set object generation query param to ensure the same object content is requested. | ||
if ( | ||
self._object_generation is not None | ||
and _helpers._get_generation_from_url(self.media_url) is None | ||
): | ||
query_param = {"generation": self._object_generation} | ||
url = _helpers.add_query_parameters(self.media_url, query_param) | ||
|
||
result = transport.request(method, url, **request_kwargs) | ||
|
||
# If a generation hasn't been specified, and this is the first response we get, let's record the | ||
# generation. In future requests we'll specify the generation query param to avoid data races. | ||
if self._object_generation is None: | ||
self._object_generation = _helpers._parse_generation_header( | ||
result, self._get_headers | ||
) | ||
|
||
self._process_response(result) | ||
|
||
if self._stream is not None: | ||
|
@@ -223,20 +263,30 @@ def _write_to_stream(self, response): | |
~google.resumable_media.common.DataCorruption: If the download's | ||
checksum doesn't agree with server-computed checksum. | ||
""" | ||
|
||
# `_get_expected_checksum()` may return None even if a checksum was | ||
# requested, in which case it will emit an info log _MISSING_CHECKSUM. | ||
# If an invalid checksum type is specified, this will raise ValueError. | ||
expected_checksum, checksum_object = _helpers._get_expected_checksum( | ||
response, self._get_headers, self.media_url, checksum_type=self.checksum | ||
) | ||
# Retrieve the expected checksum only once for the download request, | ||
# then compute and validate the checksum when the full download completes. | ||
# Retried requests are range requests, and there's no way to detect | ||
# data corruption for that byte range alone. | ||
if self._expected_checksum is None and self._checksum_object is None: | ||
# `_get_expected_checksum()` may return None even if a checksum was | ||
# requested, in which case it will emit an info log _MISSING_CHECKSUM. | ||
# If an invalid checksum type is specified, this will raise ValueError. | ||
expected_checksum, checksum_object = _helpers._get_expected_checksum( | ||
response, self._get_headers, self.media_url, checksum_type=self.checksum | ||
) | ||
self._expected_checksum = expected_checksum | ||
self._checksum_object = checksum_object | ||
else: | ||
expected_checksum = self._expected_checksum | ||
checksum_object = self._checksum_object | ||
|
||
with response: | ||
body_iter = response.raw.stream( | ||
_request_helpers._SINGLE_GET_CHUNK_SIZE, decode_content=False | ||
) | ||
for chunk in body_iter: | ||
self._stream.write(chunk) | ||
self._bytes_downloaded += len(chunk) | ||
checksum_object.update(chunk) | ||
response._content_consumed = True | ||
|
||
|
@@ -285,19 +335,47 @@ def consume( | |
ValueError: If the current :class:`Download` has already | ||
finished. | ||
""" | ||
method, url, payload, headers = self._prepare_request() | ||
method, _, payload, headers = self._prepare_request() | ||
# NOTE: We assume "payload is None" but pass it along anyway. | ||
request_kwargs = { | ||
"data": payload, | ||
"headers": headers, | ||
"timeout": timeout, | ||
"stream": True, | ||
} | ||
|
||
# Assign object generation if generation is specified in the media url. | ||
if self._object_generation is None: | ||
self._object_generation = _helpers._get_generation_from_url(self.media_url) | ||
|
||
# Wrap the request business logic in a function to be retried. | ||
def retriable_request(): | ||
# NOTE: We assume "payload is None" but pass it along anyway. | ||
result = transport.request( | ||
method, | ||
url, | ||
data=payload, | ||
headers=headers, | ||
stream=True, | ||
timeout=timeout, | ||
) | ||
url = self.media_url | ||
|
||
# To restart an interrupted download, read from the offset of last byte | ||
# received using a range request, and set object generation query param. | ||
if self._bytes_downloaded > 0: | ||
_download.add_bytes_range( | ||
self._bytes_downloaded, self.end, self._headers | ||
) | ||
request_kwargs["headers"] = self._headers | ||
|
||
# Set object generation query param to ensure the same object content is requested. | ||
if ( | ||
self._object_generation is not None | ||
and _helpers._get_generation_from_url(self.media_url) is None | ||
): | ||
query_param = {"generation": self._object_generation} | ||
url = _helpers.add_query_parameters(self.media_url, query_param) | ||
|
||
result = transport.request(method, url, **request_kwargs) | ||
|
||
# If a generation hasn't been specified, and this is the first response we get, let's record the | ||
# generation. In future requests we'll specify the generation query param to avoid data races. | ||
if self._object_generation is None: | ||
self._object_generation = _helpers._parse_generation_header( | ||
result, self._get_headers | ||
) | ||
|
||
self._process_response(result) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What causes this case to happen? Transcoding?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is due to retried requests being range requests. For range requests, as noted here, there's no way to detect data corruption for that byte range alone.
Therefore, here we retrieve the expected checksum/checksum object only once for the initial download request. Then we calculate and validate the checksum when the download completes.