From 213086e5c9ec0ebcefc1f99b782f0ff1c5fb297a Mon Sep 17 00:00:00 2001 From: Dan LaManna Date: Tue, 21 Feb 2023 14:39:04 -0500 Subject: [PATCH 1/2] Add back md5 to zarr object url signing The MD5 parameter needs to be part of the original signed URL, otherwise S3 will reject the upload URL. This means the MD5 header used by the client is mandatory. --- dandiapi/api/storage.py | 7 +++++-- dandiapi/zarr/models.py | 5 +++-- dandiapi/zarr/tests/test_zarr_upload.py | 6 +++--- dandiapi/zarr/views/__init__.py | 7 ++++--- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/dandiapi/api/storage.py b/dandiapi/api/storage.py index 820eda813..47c056f8e 100644 --- a/dandiapi/api/storage.py +++ b/dandiapi/api/storage.py @@ -143,13 +143,14 @@ def etag_from_blob_name(self, blob_name) -> str | None: return etag[1:-1] return etag - def generate_presigned_put_object_url(self, blob_name: str) -> str: + def generate_presigned_put_object_url(self, blob_name: str, md5: str) -> str: return self.connection.meta.client.generate_presigned_url( ClientMethod='put_object', Params={ 'Bucket': self.bucket_name, 'Key': blob_name, 'ACL': 'bucket-owner-full-control', + 'ContentMD5': md5, }, ExpiresIn=600, # TODO proper expiration ) @@ -190,7 +191,9 @@ def etag_from_blob_name(self, blob_name) -> str | None: else: return response.etag - def generate_presigned_put_object_url(self, blob_name: str) -> str: + def generate_presigned_put_object_url(self, blob_name: str, _: str) -> str: + # Note: minio-py doesn't support using Content-MD5 headers + # storage.client will generate URLs like `http://minio:9000/...` when running in # docker. To avoid this, use the secondary base_url_client which is configured to # generate URLs like `http://localhost:9000/...`. diff --git a/dandiapi/zarr/models.py b/dandiapi/zarr/models.py index 67b8e51c3..68355c6b1 100644 --- a/dandiapi/zarr/models.py +++ b/dandiapi/zarr/models.py @@ -68,9 +68,10 @@ def s3_url(self): s3_url = urlunparse((parsed[0], parsed[1], parsed[2], '', '', '')) return s3_url - def generate_upload_urls(self, paths: list[str]): + def generate_upload_urls(self, path_md5s: list[dict]): return [ - self.storage.generate_presigned_put_object_url(self.s3_path(path)) for path in paths + self.storage.generate_presigned_put_object_url(self.s3_path(o['path']), o['md5']) + for o in path_md5s ] def mark_pending(self): diff --git a/dandiapi/zarr/tests/test_zarr_upload.py b/dandiapi/zarr/tests/test_zarr_upload.py index ceac32b59..baf0e10ef 100644 --- a/dandiapi/zarr/tests/test_zarr_upload.py +++ b/dandiapi/zarr/tests/test_zarr_upload.py @@ -25,7 +25,7 @@ def test_zarr_rest_upload_start( # Request upload files resp = authenticated_api_client.post( f'/api/zarr/{zarr_archive.zarr_id}/files/', - ['foo/bar.txt'], + [{'path': 'foo/bar.txt', 'md5': '12345'}], format='json', ) assert resp.status_code == 200 @@ -41,7 +41,7 @@ def test_zarr_rest_upload_start( # Request more resp = authenticated_api_client.post( f'/api/zarr/{zarr_archive.zarr_id}/files/', - ['foo/bar2.txt'], + [{'path': 'foo/bar2.txt', 'md5': '12345'}], format='json', ) assert resp.status_code == 200 @@ -52,7 +52,7 @@ def test_zarr_rest_upload_start( def test_zarr_rest_upload_start_not_an_owner(authenticated_api_client, zarr_archive: ZarrArchive): resp = authenticated_api_client.post( f'/api/zarr/{zarr_archive.zarr_id}/files/', - ['foo/bar.txt'], + [{'path': 'foo/bar.txt', 'md5': '12345'}], format='json', ) assert resp.status_code == 403 diff --git a/dandiapi/zarr/views/__init__.py b/dandiapi/zarr/views/__init__.py index dad604ca3..dd422e700 100644 --- a/dandiapi/zarr/views/__init__.py +++ b/dandiapi/zarr/views/__init__.py @@ -24,8 +24,9 @@ logger = logging.getLogger(__name__) -class ZarrFileCreationSerializer(serializers.ListSerializer): - child = serializers.CharField() +class ZarrFileCreationSerializer(serializers.Serializer): + path = serializers.CharField() + md5 = serializers.CharField() class ZarrDeleteFileRequestSerializer(serializers.Serializer): @@ -271,7 +272,7 @@ def create_files(self, request, zarr_id): if not self.request.user.has_perm('owner', zarr_archive.dandiset): raise PermissionDenied() - serializer = ZarrFileCreationSerializer(data=request.data) + serializer = ZarrFileCreationSerializer(data=request.data, many=True) serializer.is_valid(raise_exception=True) # Generate presigned urls From ec5f69ea8dc660f892c3f5ffc467f6ee049265bc Mon Sep 17 00:00:00 2001 From: Jacob Nesbitt Date: Tue, 21 Feb 2023 15:09:17 -0500 Subject: [PATCH 2/2] Update zarr upload process design doc --- doc/design/zarr-performance-redesign.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/design/zarr-performance-redesign.md b/doc/design/zarr-performance-redesign.md index afb0a3bf5..5955ec0a5 100644 --- a/doc/design/zarr-performance-redesign.md +++ b/doc/design/zarr-performance-redesign.md @@ -100,7 +100,7 @@ sequenceDiagram Server-->>-Client: PENDING Zarr Archive loop for each file - Client->>+Server: Request signed URLs + Client->>+Server: Request signed URLs (w/ path and md5) Server-->>-Client: A list of signed URLs Client->>+S3: Upload individual file using signed URL end @@ -120,7 +120,7 @@ sequenceDiagram Client->>+Client: Verify zarr checksum with local ``` -(Step 1): **`dandi-cli` computes the Zarr checksum locally**. (Note that this step can actually be taken any time before step 12; it is listed here arbitrarily.) +(Step 1): **`dandi-cli` computes the Zarr checksum locally**. (Steps 2 and 3): `dandi-cli` asks the server to create a new Zarr archive, which is put into the `PENDING` state.