Fixed content-handler response-headers/object-storage collision. #4251

Merged: 1 commit, Aug 9, 2023
1 change: 1 addition & 0 deletions CHANGES/4028.bugfix
@@ -0,0 +1 @@
Taught downloader to correctly handle plugin-specified headers for object-storage backends.
37 changes: 37 additions & 0 deletions pulpcore/constants.py
@@ -67,3 +67,40 @@
)

EXPORT_BATCH_SIZE = 2000

# Mapping of HTTP response headers to what the various object-storage APIs call them
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3/client/get_object.html
# response-headers S3 respects, and what they map to in an S3 object
S3_RESPONSE_HEADER_MAP = {
"Content-Disposition": "ResponseContentDisposition",
"Content-Type": "ResponseContentType",
"Cache-Control": "ResponseCacheControl",
"Content-Language": "ResponseContentLanguage",
"Expires": "ResponseExpires",
"Content-Encoding": "ResponseContentEncoding",
}
# https://learn.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.contentsettings?view=azure-python
# response-headers azure respects, and what they map to in an azure object
AZURE_RESPONSE_HEADER_MAP = {
"Content-Disposition": "content_disposition",
"Content-Type": "content_type",
"Cache-Control": "cache_control",
"Content-Language": "content_language",
"Content-Encoding": "content_encoding",
}
# https://gcloud.readthedocs.io/en/latest/storage-blobs.html
# response-headers Google Cloud Storage respects, and what they map to in a GCS object
GCS_RESPONSE_HEADER_MAP = {
"Content-Disposition": "content_disposition",
"Content-Type": "content_type",
"Cache-Control": "cache_control",
"Content-Language": "content_language",
"Content-Encoding": "content_encoding",
}

# Storage-type mapped to storage-response-map
STORAGE_RESPONSE_MAP = {
"storages.backends.s3boto3.S3Boto3Storage": S3_RESPONSE_HEADER_MAP,
"storages.backends.azure_storage.AzureStorage": AZURE_RESPONSE_HEADER_MAP,
"storages.backends.gcloud.GoogleCloudStorage": GCS_RESPONSE_HEADER_MAP,
}
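
For orientation, and not part of the diff: the handler changes below consult STORAGE_RESPONSE_MAP to translate whatever standard response headers a plugin supplied into the parameter names each backend's URL-signing API expects. A minimal standalone sketch of that translation, assuming the constants above have landed; the translate_headers name and the sample header values are purely illustrative:

from pulpcore.constants import STORAGE_RESPONSE_MAP

def translate_headers(headers, storage_class):
    # Look up the backend-specific map; unknown backends yield no extra params.
    header_map = STORAGE_RESPONSE_MAP.get(storage_class, {})
    return {header_map[name]: value for name, value in headers.items() if name in header_map}

params = translate_headers(
    {"Content-Type": "text/plain", "Content-Disposition": "attachment;filename=foo.txt"},
    "storages.backends.s3boto3.S3Boto3Storage",
)
# params == {"ResponseContentType": "text/plain",
#            "ResponseContentDisposition": "attachment;filename=foo.txt"}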
26 changes: 17 additions & 9 deletions pulpcore/content/handler.py
@@ -20,6 +20,7 @@

import django

+from pulpcore.constants import STORAGE_RESPONSE_MAP
from pulpcore.responses import ArtifactResponse

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "pulpcore.app.settings")
@@ -894,6 +895,16 @@ async def _serve_content_artifact(self, content_artifact, headers, request):
        Returns:
            The :class:`aiohttp.web.FileResponse` for the file.
        """
+
+        def _set_params_from_headers(hdrs, storage_domain):
Member:

When we call this function, we have identified the storage class already, so I think we can spare ourselves an if-check and a dict lookup by providing the response map instead of the storage class name.

def _set_params_from_headers(headers, response_header_map):
    params = {}
    for key, new_key in response_header_map.items():
        if key in headers:
            params[new_key] = headers[key]
    return params

or even:

def _set_params_from_headers(headers, response_header_map):
    return {new_key: headers[key] for key, new_key in response_header_map.items() if key in headers}

We'd call it, e.g. in the S3 branch, like this:

_set_params_from_headers(headers, S3_RESPONSE_HEADER_MAP)

Contributor Author:

Passing the mapping-dictionary "assumes" that a simple lookup is all we're ever going to want to do. If we want/need to do anything more complicated in the future, the current approach lets us do so in a single place.

In terms of performance - given the current state of the content-handler, sparing "one dictionary lookup" is not convincing.

Member:

I was thinking about my performance in understanding the code. But it's not going to block this contribution.

+            # Map standard-response-headers to storage-object-specific keys
+            params = {}
+            if storage_domain in STORAGE_RESPONSE_MAP:
+                for a_key in STORAGE_RESPONSE_MAP[storage_domain]:
+                    if hdrs.get(a_key, None):
+                        params[STORAGE_RESPONSE_MAP[storage_domain][a_key]] = hdrs[a_key]
+            return params
+
        artifact_file = content_artifact.artifact.file
        artifact_name = artifact_file.name
        filename = os.path.basename(content_artifact.relative_path)
@@ -909,9 +920,8 @@ async def _serve_content_artifact(self, content_artifact, headers, request):
        elif not domain.redirect_to_object_storage:
            return ArtifactResponse(content_artifact.artifact, headers=headers)
        elif domain.storage_class == "storages.backends.s3boto3.S3Boto3Storage":
-            parameters = {"ResponseContentDisposition": content_disposition}
-            if headers.get("Content-Type"):
-                parameters["ResponseContentType"] = headers.get("Content-Type")
+            headers["Content-Disposition"] = content_disposition
+            parameters = _set_params_from_headers(headers, domain.storage_class)
            url = URL(
                artifact_file.storage.url(
                    artifact_name, parameters=parameters, http_method=request.method
@@ -920,15 +930,13 @@ async def _serve_content_artifact(self, content_artifact, headers, request):
            )
            raise HTTPFound(url)
        elif domain.storage_class == "storages.backends.azure_storage.AzureStorage":
-            parameters = {"content_disposition": content_disposition}
-            if headers.get("Content-Type"):
-                parameters["content_type"] = headers.get("Content-Type")
+            headers["Content-Disposition"] = content_disposition
+            parameters = _set_params_from_headers(headers, domain.storage_class)
            url = URL(artifact_file.storage.url(artifact_name, parameters=parameters), encoded=True)
            raise HTTPFound(url)
        elif domain.storage_class == "storages.backends.gcloud.GoogleCloudStorage":
-            parameters = {"response_disposition": content_disposition}
-            if headers.get("Content-Type"):
-                parameters["content_type"] = headers.get("Content-Type")
+            headers["Content-Disposition"] = content_disposition
+            parameters = _set_params_from_headers(headers, domain.storage_class)
            url = URL(artifact_file.storage.url(artifact_name, parameters=parameters), encoded=True)
            raise HTTPFound(url)
        else:
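
As a rough illustration of the net effect (again, not part of the PR; the header values below are made up): once the handler folds its computed Content-Disposition into the plugin-supplied headers, the S3 branch derives all of its presigned-URL parameters from the one map instead of hard-coding two of them.

# Hypothetical plugin-supplied headers plus the handler-computed disposition:
headers = {
    "Content-Type": "application/gzip",
    "Cache-Control": "max-age=86400",
    "Content-Disposition": "attachment;filename=example.tar.gz",
}

# With domain.storage_class == "storages.backends.s3boto3.S3Boto3Storage",
# _set_params_from_headers(headers, domain.storage_class) returns:
# {
#     "ResponseContentDisposition": "attachment;filename=example.tar.gz",
#     "ResponseContentType": "application/gzip",
#     "ResponseCacheControl": "max-age=86400",
# }
# django-storages forwards these to the presigned GET URL, so S3 echoes them
# back as response headers; previously only ResponseContentDisposition and
# ResponseContentType were ever set.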