From cea25c77235ccda205b0942881b1096aa0a4ec25 Mon Sep 17 00:00:00 2001 From: Daniel Chiquito Date: Wed, 2 Mar 2022 10:11:19 -0500 Subject: [PATCH 1/5] Rename path to name in zarr checksum --- dandischema/digests/tests/test_zarr.py | 60 +++++++++++++------------- dandischema/digests/zarr.py | 20 ++++----- 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/dandischema/digests/tests/test_zarr.py b/dandischema/digests/tests/test_zarr.py index fc6fbccd..0ada32f1 100644 --- a/dandischema/digests/tests/test_zarr.py +++ b/dandischema/digests/tests/test_zarr.py @@ -13,8 +13,8 @@ def test_zarr_checksum_sort_order(): # The a < b in the path should take precedence over z > y in the md5 - a = ZarrChecksum(path="1/2/3/a/z", md5="z") - b = ZarrChecksum(path="1/2/3/b/z", md5="y") + a = ZarrChecksum(name="a", md5="z") + b = ZarrChecksum(name="b", md5="y") assert sorted([b, a]) == [a, b] @@ -24,16 +24,16 @@ def test_zarr_checksum_sort_order(): def test_zarr_checkums_is_empty(): assert ZarrChecksums(directories=[], files=[]).is_empty assert not ZarrChecksums( - directories=[ZarrChecksum(md5="md5", path="path")], files=[] + directories=[ZarrChecksum(md5="md5", name="name")], files=[] ).is_empty assert not ZarrChecksums( - directories=[], files=[ZarrChecksum(md5="md5", path="path")] + directories=[], files=[ZarrChecksum(md5="md5", name="name")] ).is_empty -a = ZarrChecksum(path="a", md5="a") -b = ZarrChecksum(path="b", md5="b") -c = ZarrChecksum(path="c", md5="c") +a = ZarrChecksum(name="a", md5="a") +b = ZarrChecksum(name="b", md5="b") +c = ZarrChecksum(name="c", md5="c") @pytest.mark.parametrize( @@ -113,19 +113,19 @@ def test_zarr_checkums_remove_checksums( [ ([], [], "481a2f77ab786a0f45aafd5db0971caa"), ( - [ZarrChecksum(path="foo/bar", md5="a")], + [ZarrChecksum(name="bar", md5="a")], [], "cdcfdfca3622e20df03219273872549e", ), ( [], - [ZarrChecksum(path="foo/bar", md5="a")], + [ZarrChecksum(name="bar", md5="a")], "243aca82c6872222747183dd738b6fcb", ), ( [ - ZarrChecksum(path="foo/bar", md5="a"), - ZarrChecksum(path="foo/baz", md5="b"), + ZarrChecksum(name="bar", md5="a"), + ZarrChecksum(name="baz", md5="b"), ], [], "785295076ae9156b363e442ef6d485e0", @@ -133,14 +133,14 @@ def test_zarr_checkums_remove_checksums( ( [], [ - ZarrChecksum(path="foo/bar", md5="a"), - ZarrChecksum(path="foo/baz", md5="b"), + ZarrChecksum(name="bar", md5="a"), + ZarrChecksum(name="baz", md5="b"), ], "ebca8bb8e716237e0f71657d1045930f", ), ( - [ZarrChecksum(path="foo/baz", md5="a")], - [ZarrChecksum(path="foo/bar", md5="b")], + [ZarrChecksum(name="baz", md5="a")], + [ZarrChecksum(name="bar", md5="b")], "9c34644ba03b7e9f58ebd1caef4215ad", ), ], @@ -160,8 +160,8 @@ def test_zarr_checksum_serializer_aggregate_checksum( def test_zarr_checksum_serializer_generate_listing(): serializer = ZarrJSONChecksumSerializer() checksums = ZarrChecksums( - files=[ZarrChecksum(path="foo/bar", md5="a")], - directories=[ZarrChecksum(path="foo/baz", md5="b")], + files=[ZarrChecksum(name="bar", md5="a")], + directories=[ZarrChecksum(name="baz", md5="b")], ) assert serializer.generate_listing(checksums) == ZarrChecksumListing( checksums=checksums, md5="23076057c0da63f8ab50d0a108db332c" @@ -174,24 +174,24 @@ def test_zarr_serialize(): serializer.serialize( ZarrChecksumListing( checksums=ZarrChecksums( - files=[ZarrChecksum(path="foo/bar", md5="a")], - directories=[ZarrChecksum(path="bar/foo", md5="b")], + files=[ZarrChecksum(name="bar", md5="a")], + directories=[ZarrChecksum(name="foo", md5="b")], ), md5="c", ) ) - == '{"checksums":{"directories":[{"md5":"b","path":"bar/foo"}],"files":[{"md5":"a","path":"foo/bar"}]},"md5":"c"}' # noqa: E501 + == '{"checksums":{"directories":[{"md5":"b","name":"foo"}],"files":[{"md5":"a","name":"bar"}]},"md5":"c"}' # noqa: E501 ) def test_zarr_deserialize(): serializer = ZarrJSONChecksumSerializer() assert serializer.deserialize( - '{"checksums":{"directories":[{"md5":"b","path":"bar/foo"}],"files":[{"md5":"a","path":"foo/bar"}]},"md5":"c"}' # noqa: E501 + '{"checksums":{"directories":[{"md5":"b","name":"foo"}],"files":[{"md5":"a","name":"bar"}]},"md5":"c"}' # noqa: E501 ) == ZarrChecksumListing( checksums=ZarrChecksums( - files=[ZarrChecksum(path="foo/bar", md5="a")], - directories=[ZarrChecksum(path="bar/foo", md5="b")], + files=[ZarrChecksum(name="bar", md5="a")], + directories=[ZarrChecksum(name="foo", md5="b")], ), md5="c", ) @@ -201,33 +201,33 @@ def test_zarr_deserialize(): "files,directories,checksum", [ ( - {"foo/bar": "a"}, + {"bar": "a"}, {}, "cdcfdfca3622e20df03219273872549e", ), ( {}, - {"foo/bar": "a"}, + {"bar": "a"}, "243aca82c6872222747183dd738b6fcb", ), ( - {"foo/bar": "a", "foo/baz": "b"}, + {"bar": "a", "baz": "b"}, {}, "785295076ae9156b363e442ef6d485e0", ), ( {}, - {"foo/bar": "a", "foo/baz": "b"}, + {"bar": "a", "baz": "b"}, "ebca8bb8e716237e0f71657d1045930f", ), ( {}, - {"foo/baz": "b", "foo/bar": "a"}, + {"baz": "b", "bar": "a"}, "ebca8bb8e716237e0f71657d1045930f", ), ( - {"foo/baz": "a"}, - {"foo/bar": "b"}, + {"baz": "a"}, + {"bar": "b"}, "9c34644ba03b7e9f58ebd1caef4215ad", ), ], diff --git a/dandischema/digests/zarr.py b/dandischema/digests/zarr.py index 93b019ef..f51fe912 100644 --- a/dandischema/digests/zarr.py +++ b/dandischema/digests/zarr.py @@ -15,15 +15,15 @@ class ZarrChecksum(pydantic.BaseModel): """ A checksum for a single file/directory in a zarr file. - Every file and directory in a zarr archive has a path and a MD5 hash. + Every file and directory in a zarr archive has a name and a MD5 hash. """ md5: str - path: str + name: str # To make ZarrChecksums sortable def __lt__(self, other: ZarrChecksum): - return self.path < other.path + return self.name < other.name class ZarrChecksums(pydantic.BaseModel): @@ -43,7 +43,7 @@ def is_empty(self): def _index(self, checksums: List[ZarrChecksum], checksum: ZarrChecksum): # O(n) performance, consider using the bisect module or an ordered dict for optimization for i in range(0, len(checksums)): - if checksums[i].path == checksum.path: + if checksums[i].name == checksum.name: return i raise ValueError("Not found") @@ -66,13 +66,13 @@ def add_directory_checksums(self, checksums: List[ZarrChecksum]): self.directories.append(new_checksum) self.directories = sorted(self.directories) - def remove_checksums(self, paths: List[str]): - """Remove a list of paths from the listing.""" + def remove_checksums(self, names: List[str]): + """Remove a list of names from the listing.""" self.files = sorted( - filter(lambda checksum: checksum.path not in paths, self.files) + filter(lambda checksum: checksum.name not in names, self.files) ) self.directories = sorted( - filter(lambda checksum: checksum.path not in paths, self.directories) + filter(lambda checksum: checksum.name not in names, self.directories) ) @@ -142,9 +142,9 @@ def get_checksum(files: Dict[str, str], directories: Dict[str, str]) -> str: if not files and not directories: raise ValueError("Cannot compute a Zarr checksum for an empty directory") checksum_listing = ZarrJSONChecksumSerializer().generate_listing( - files=[ZarrChecksum(md5=md5, path=path) for path, md5 in files.items()], + files=[ZarrChecksum(md5=md5, name=name) for name, md5 in files.items()], directories=[ - ZarrChecksum(md5=md5, path=path) for path, md5 in directories.items() + ZarrChecksum(md5=md5, name=name) for name, md5 in directories.items() ], ) return checksum_listing.md5 From a117865d9bead86edf065a76125239848a3ae03d Mon Sep 17 00:00:00 2001 From: Daniel Chiquito Date: Wed, 2 Mar 2022 10:30:03 -0500 Subject: [PATCH 2/5] Add size to zarr checksums --- dandischema/digests/tests/test_zarr.py | 88 ++++++++++++++------------ dandischema/digests/zarr.py | 20 ++++-- 2 files changed, 62 insertions(+), 46 deletions(-) diff --git a/dandischema/digests/tests/test_zarr.py b/dandischema/digests/tests/test_zarr.py index 0ada32f1..a563706e 100644 --- a/dandischema/digests/tests/test_zarr.py +++ b/dandischema/digests/tests/test_zarr.py @@ -13,8 +13,8 @@ def test_zarr_checksum_sort_order(): # The a < b in the path should take precedence over z > y in the md5 - a = ZarrChecksum(name="a", md5="z") - b = ZarrChecksum(name="b", md5="y") + a = ZarrChecksum(name="a", md5="z", size=1) + b = ZarrChecksum(name="b", md5="y", size=1) assert sorted([b, a]) == [a, b] @@ -24,16 +24,16 @@ def test_zarr_checksum_sort_order(): def test_zarr_checkums_is_empty(): assert ZarrChecksums(directories=[], files=[]).is_empty assert not ZarrChecksums( - directories=[ZarrChecksum(md5="md5", name="name")], files=[] + directories=[ZarrChecksum(md5="md5", name="name", size=1)], files=[] ).is_empty assert not ZarrChecksums( - directories=[], files=[ZarrChecksum(md5="md5", name="name")] + directories=[], files=[ZarrChecksum(md5="md5", name="name", size=1)] ).is_empty -a = ZarrChecksum(name="a", md5="a") -b = ZarrChecksum(name="b", md5="b") -c = ZarrChecksum(name="c", md5="c") +a = ZarrChecksum(name="a", md5="a", size=1) +b = ZarrChecksum(name="b", md5="b", size=1) +c = ZarrChecksum(name="c", md5="c", size=1) @pytest.mark.parametrize( @@ -113,35 +113,35 @@ def test_zarr_checkums_remove_checksums( [ ([], [], "481a2f77ab786a0f45aafd5db0971caa"), ( - [ZarrChecksum(name="bar", md5="a")], + [ZarrChecksum(name="bar", md5="a", size=1)], [], - "cdcfdfca3622e20df03219273872549e", + "677dddd9af150be166c461acdef1b025", ), ( [], - [ZarrChecksum(name="bar", md5="a")], - "243aca82c6872222747183dd738b6fcb", + [ZarrChecksum(name="bar", md5="a", size=1)], + "aa776d184c64cbd6a5956ab0af012830", ), ( [ - ZarrChecksum(name="bar", md5="a"), - ZarrChecksum(name="baz", md5="b"), + ZarrChecksum(name="bar", md5="a", size=1), + ZarrChecksum(name="baz", md5="b", size=1), ], [], - "785295076ae9156b363e442ef6d485e0", + "c8a9b1dd53bb43ec6e5d379c29a1f1dd", ), ( [], [ - ZarrChecksum(name="bar", md5="a"), - ZarrChecksum(name="baz", md5="b"), + ZarrChecksum(name="bar", md5="a", size=1), + ZarrChecksum(name="baz", md5="b", size=1), ], - "ebca8bb8e716237e0f71657d1045930f", + "f45aa3833a2129628a38e421f74ff792", ), ( - [ZarrChecksum(name="baz", md5="a")], - [ZarrChecksum(name="bar", md5="b")], - "9c34644ba03b7e9f58ebd1caef4215ad", + [ZarrChecksum(name="baz", md5="a", size=1)], + [ZarrChecksum(name="bar", md5="b", size=1)], + "bc0a0e85a0205eb3cb5f163f173774e5", ), ], ) @@ -160,11 +160,13 @@ def test_zarr_checksum_serializer_aggregate_checksum( def test_zarr_checksum_serializer_generate_listing(): serializer = ZarrJSONChecksumSerializer() checksums = ZarrChecksums( - files=[ZarrChecksum(name="bar", md5="a")], - directories=[ZarrChecksum(name="baz", md5="b")], + files=[ZarrChecksum(name="bar", md5="a", size=1)], + directories=[ZarrChecksum(name="baz", md5="b", size=2)], ) assert serializer.generate_listing(checksums) == ZarrChecksumListing( - checksums=checksums, md5="23076057c0da63f8ab50d0a108db332c" + checksums=checksums, + md5="c20479b1afe558a919eac450028a706e", + size=3, ) @@ -174,26 +176,28 @@ def test_zarr_serialize(): serializer.serialize( ZarrChecksumListing( checksums=ZarrChecksums( - files=[ZarrChecksum(name="bar", md5="a")], - directories=[ZarrChecksum(name="foo", md5="b")], + files=[ZarrChecksum(name="bar", md5="a", size=1)], + directories=[ZarrChecksum(name="foo", md5="b", size=2)], ), md5="c", + size=3, ) ) - == '{"checksums":{"directories":[{"md5":"b","name":"foo"}],"files":[{"md5":"a","name":"bar"}]},"md5":"c"}' # noqa: E501 + == '{"checksums":{"directories":[{"md5":"b","name":"foo","size":2}],"files":[{"md5":"a","name":"bar","size":1}]},"md5":"c","size":3}' # noqa: E501 ) def test_zarr_deserialize(): serializer = ZarrJSONChecksumSerializer() assert serializer.deserialize( - '{"checksums":{"directories":[{"md5":"b","name":"foo"}],"files":[{"md5":"a","name":"bar"}]},"md5":"c"}' # noqa: E501 + '{"checksums":{"directories":[{"md5":"b","name":"foo","size":2}],"files":[{"md5":"a","name":"bar","size":1}]},"md5":"c","size":3}' # noqa: E501 ) == ZarrChecksumListing( checksums=ZarrChecksums( - files=[ZarrChecksum(name="bar", md5="a")], - directories=[ZarrChecksum(name="foo", md5="b")], + files=[ZarrChecksum(name="bar", md5="a", size=1)], + directories=[ZarrChecksum(name="foo", md5="b", size=2)], ), md5="c", + size=3, ) @@ -201,34 +205,34 @@ def test_zarr_deserialize(): "files,directories,checksum", [ ( - {"bar": "a"}, + {"bar": ("a", 1)}, {}, - "cdcfdfca3622e20df03219273872549e", + "677dddd9af150be166c461acdef1b025", ), ( {}, - {"bar": "a"}, - "243aca82c6872222747183dd738b6fcb", + {"bar": ("a", 1)}, + "aa776d184c64cbd6a5956ab0af012830", ), ( - {"bar": "a", "baz": "b"}, + {"bar": ("a", 1), "baz": ("b", 2)}, {}, - "785295076ae9156b363e442ef6d485e0", + "66c03ae00824e6be1283cc370969f6ea", ), ( {}, - {"bar": "a", "baz": "b"}, - "ebca8bb8e716237e0f71657d1045930f", + {"bar": ("a", 1), "baz": ("b", 2)}, + "6969470da4b829f0a8b665ac78350abd", ), ( {}, - {"baz": "b", "bar": "a"}, - "ebca8bb8e716237e0f71657d1045930f", + {"baz": ("b", 1), "bar": ("a", 2)}, + "25f351bbdcfb33f7706f7ef1e80cb010", ), ( - {"baz": "a"}, - {"bar": "b"}, - "9c34644ba03b7e9f58ebd1caef4215ad", + {"baz": ("a", 1)}, + {"bar": ("b", 2)}, + "a9540738019a48e6392c942217f7526d", ), ], ) diff --git a/dandischema/digests/zarr.py b/dandischema/digests/zarr.py index f51fe912..ee726c33 100644 --- a/dandischema/digests/zarr.py +++ b/dandischema/digests/zarr.py @@ -2,7 +2,7 @@ from functools import total_ordering import hashlib -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Tuple import pydantic @@ -20,6 +20,7 @@ class ZarrChecksum(pydantic.BaseModel): md5: str name: str + size: int # To make ZarrChecksums sortable def __lt__(self, other: ZarrChecksum): @@ -85,6 +86,7 @@ class ZarrChecksumListing(pydantic.BaseModel): checksums: ZarrChecksums md5: str + size: int class ZarrJSONChecksumSerializer: @@ -125,9 +127,13 @@ def generate_listing( files=sorted(files) if files is not None else [], directories=sorted(directories) if directories is not None else [], ) + size = sum([file.size for file in checksums.files]) + sum( + [directory.size for directory in checksums.directories] + ) return ZarrChecksumListing( checksums=checksums, md5=self.aggregate_checksum(checksums), + size=size, ) @@ -137,14 +143,20 @@ def generate_listing( EMPTY_CHECKSUM = ZarrJSONChecksumSerializer().generate_listing(ZarrChecksums()).md5 -def get_checksum(files: Dict[str, str], directories: Dict[str, str]) -> str: +def get_checksum( + files: Dict[str, Tuple[str, int]], directories: Dict[str, Tuple[str, int]] +) -> str: """Calculate the checksum of a directory.""" if not files and not directories: raise ValueError("Cannot compute a Zarr checksum for an empty directory") checksum_listing = ZarrJSONChecksumSerializer().generate_listing( - files=[ZarrChecksum(md5=md5, name=name) for name, md5 in files.items()], + files=[ + ZarrChecksum(md5=md5, name=name, size=size) + for name, (md5, size) in files.items() + ], directories=[ - ZarrChecksum(md5=md5, name=name) for name, md5 in directories.items() + ZarrChecksum(md5=md5, name=name, size=size) + for name, (md5, size) in directories.items() ], ) return checksum_listing.md5 From 2bd30c33c6dc2afbc0e0f743107211439dfe6653 Mon Sep 17 00:00:00 2001 From: Daniel Chiquito Date: Wed, 2 Mar 2022 12:03:12 -0500 Subject: [PATCH 3/5] Remove unnecessary list comprehension --- dandischema/digests/zarr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dandischema/digests/zarr.py b/dandischema/digests/zarr.py index ee726c33..c3cebf7f 100644 --- a/dandischema/digests/zarr.py +++ b/dandischema/digests/zarr.py @@ -127,8 +127,8 @@ def generate_listing( files=sorted(files) if files is not None else [], directories=sorted(directories) if directories is not None else [], ) - size = sum([file.size for file in checksums.files]) + sum( - [directory.size for directory in checksums.directories] + size = sum(file.size for file in checksums.files) + sum( + directory.size for directory in checksums.directories ) return ZarrChecksumListing( checksums=checksums, From 38efd136ecac2b4359cdff2045f5bb2ee0fe0dc5 Mon Sep 17 00:00:00 2001 From: Daniel Chiquito Date: Wed, 2 Mar 2022 16:46:47 -0500 Subject: [PATCH 4/5] Enhance zarr checksum digests --- dandischema/digests/tests/test_zarr.py | 181 +++++++++++++++++-------- dandischema/digests/zarr.py | 54 +++++--- 2 files changed, 161 insertions(+), 74 deletions(-) diff --git a/dandischema/digests/tests/test_zarr.py b/dandischema/digests/tests/test_zarr.py index a563706e..67f1e4b4 100644 --- a/dandischema/digests/tests/test_zarr.py +++ b/dandischema/digests/tests/test_zarr.py @@ -12,9 +12,9 @@ def test_zarr_checksum_sort_order(): - # The a < b in the path should take precedence over z > y in the md5 - a = ZarrChecksum(name="a", md5="z", size=1) - b = ZarrChecksum(name="b", md5="y", size=1) + # The a < b in the path should take precedence over z > y in the checksum + a = ZarrChecksum(name="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", digest="z", size=1) + b = ZarrChecksum(name="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", digest="y", size=1) assert sorted([b, a]) == [a, b] @@ -24,16 +24,24 @@ def test_zarr_checksum_sort_order(): def test_zarr_checkums_is_empty(): assert ZarrChecksums(directories=[], files=[]).is_empty assert not ZarrChecksums( - directories=[ZarrChecksum(md5="md5", name="name", size=1)], files=[] + directories=[ZarrChecksum(digest="checksum", name="name", size=1)], files=[] ).is_empty assert not ZarrChecksums( - directories=[], files=[ZarrChecksum(md5="md5", name="name", size=1)] + directories=[], files=[ZarrChecksum(digest="checksum", name="name", size=1)] ).is_empty -a = ZarrChecksum(name="a", md5="a", size=1) -b = ZarrChecksum(name="b", md5="b", size=1) -c = ZarrChecksum(name="c", md5="c", size=1) +a = ZarrChecksum( + name="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + size=1, +) +b = ZarrChecksum( + name="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", + digest="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", + size=1, +) +c = ZarrChecksum(name="c", digest="c", size=1) @pytest.mark.parametrize( @@ -84,12 +92,12 @@ def test_zarr_checkums_add_directory_checksums(initial, new_checksums, expected) ), [ ([], [], [], [], []), - ([a], [], ["a"], [], []), - ([], [a], ["a"], [], []), - ([a], [b], ["a"], [], [b]), - ([a], [b], ["b"], [a], []), - ([a, b, c], [], ["b"], [a, c], []), - ([], [a, b, c], ["b"], [], [a, c]), + ([a], [], ["aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"], [], []), + ([], [a], ["aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"], [], []), + ([a], [b], ["aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"], [], [b]), + ([a], [b], ["bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"], [a], []), + ([a, b, c], [], ["bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"], [a, c], []), + ([], [a, b, c], ["bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"], [], [a, c]), ], ) def test_zarr_checkums_remove_checksums( @@ -109,63 +117,93 @@ def test_zarr_checkums_remove_checksums( @pytest.mark.parametrize( - "file_checksums,directory_checksums,checksum", + "file_checksums,directory_checksums,digest", [ - ([], [], "481a2f77ab786a0f45aafd5db0971caa"), + ([], [], "481a2f77ab786a0f45aafd5db0971caa-0--0"), ( - [ZarrChecksum(name="bar", md5="a", size=1)], + [ + ZarrChecksum( + name="bar", digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", size=1 + ) + ], [], - "677dddd9af150be166c461acdef1b025", + "f21b9b4bf53d7ce1167bcfae76371e59-1--1", ), ( [], - [ZarrChecksum(name="bar", md5="a", size=1)], - "aa776d184c64cbd6a5956ab0af012830", + [ + ZarrChecksum( + name="bar", digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-1--1", size=1 + ) + ], + "ea8b8290b69b96422a3ed1cca0390f21-1--1", ), ( [ - ZarrChecksum(name="bar", md5="a", size=1), - ZarrChecksum(name="baz", md5="b", size=1), + ZarrChecksum( + name="bar", digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", size=1 + ), + ZarrChecksum( + name="baz", digest="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", size=1 + ), ], [], - "c8a9b1dd53bb43ec6e5d379c29a1f1dd", + "8e50add2b46d3a6389e2d9d0924227fb-2--2", ), ( [], [ - ZarrChecksum(name="bar", md5="a", size=1), - ZarrChecksum(name="baz", md5="b", size=1), + ZarrChecksum( + name="bar", digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-1--1", size=1 + ), + ZarrChecksum( + name="baz", digest="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--1", size=1 + ), ], - "f45aa3833a2129628a38e421f74ff792", + "4c21a113688f925240549b14136d61ff-2--2", ), ( - [ZarrChecksum(name="baz", md5="a", size=1)], - [ZarrChecksum(name="bar", md5="b", size=1)], - "bc0a0e85a0205eb3cb5f163f173774e5", + [ + ZarrChecksum( + name="baz", digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", size=1 + ) + ], + [ + ZarrChecksum( + name="bar", digest="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--1", size=1 + ) + ], + "d5e4eb5dc8efdb54ff089db1eef34119-2--2", ), ], ) -def test_zarr_checksum_serializer_aggregate_checksum( - file_checksums, directory_checksums, checksum +def test_zarr_checksum_serializer_aggregate_digest( + file_checksums, directory_checksums, digest ): serializer = ZarrJSONChecksumSerializer() assert ( - serializer.aggregate_checksum( + serializer.aggregate_digest( ZarrChecksums(files=file_checksums, directories=directory_checksums) ) - == checksum + == digest ) def test_zarr_checksum_serializer_generate_listing(): serializer = ZarrJSONChecksumSerializer() checksums = ZarrChecksums( - files=[ZarrChecksum(name="bar", md5="a", size=1)], - directories=[ZarrChecksum(name="baz", md5="b", size=2)], + files=[ + ZarrChecksum(name="bar", digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", size=1) + ], + directories=[ + ZarrChecksum( + name="baz", digest="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2", size=2 + ) + ], ) assert serializer.generate_listing(checksums) == ZarrChecksumListing( checksums=checksums, - md5="c20479b1afe558a919eac450028a706e", + digest="baf791d7bac84947c14739b1684ec5ab-2--3", size=3, ) @@ -176,27 +214,47 @@ def test_zarr_serialize(): serializer.serialize( ZarrChecksumListing( checksums=ZarrChecksums( - files=[ZarrChecksum(name="bar", md5="a", size=1)], - directories=[ZarrChecksum(name="foo", md5="b", size=2)], + files=[ + ZarrChecksum( + name="bar", + digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + size=1, + ) + ], + directories=[ + ZarrChecksum( + name="foo", + digest="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2", + size=2, + ) + ], ), - md5="c", + digest="cccccccccccccccccccccccccccccccc-2--3", size=3, ) ) - == '{"checksums":{"directories":[{"md5":"b","name":"foo","size":2}],"files":[{"md5":"a","name":"bar","size":1}]},"md5":"c","size":3}' # noqa: E501 + == '{"checksums":{"directories":[{"digest":"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2","name":"foo","size":2}],"files":[{"digest":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","name":"bar","size":1}]},"digest":"cccccccccccccccccccccccccccccccc-2--3","size":3}' # noqa: E501 ) def test_zarr_deserialize(): serializer = ZarrJSONChecksumSerializer() assert serializer.deserialize( - '{"checksums":{"directories":[{"md5":"b","name":"foo","size":2}],"files":[{"md5":"a","name":"bar","size":1}]},"md5":"c","size":3}' # noqa: E501 + '{"checksums":{"directories":[{"digest":"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2","name":"foo","size":2}],"files":[{"digest":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","name":"bar","size":1}]},"digest":"cccccccccccccccccccccccccccccccc-2--3","size":3}' # noqa: E501 ) == ZarrChecksumListing( checksums=ZarrChecksums( - files=[ZarrChecksum(name="bar", md5="a", size=1)], - directories=[ZarrChecksum(name="foo", md5="b", size=2)], + files=[ + ZarrChecksum( + name="bar", digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", size=1 + ) + ], + directories=[ + ZarrChecksum( + name="foo", digest="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2", size=2 + ) + ], ), - md5="c", + digest="cccccccccccccccccccccccccccccccc-2--3", size=3, ) @@ -205,34 +263,43 @@ def test_zarr_deserialize(): "files,directories,checksum", [ ( - {"bar": ("a", 1)}, + {"bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1)}, {}, - "677dddd9af150be166c461acdef1b025", + "f21b9b4bf53d7ce1167bcfae76371e59-1--1", ), ( {}, - {"bar": ("a", 1)}, - "aa776d184c64cbd6a5956ab0af012830", + {"bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-1--1", 1)}, + "ea8b8290b69b96422a3ed1cca0390f21-1--1", ), ( - {"bar": ("a", 1), "baz": ("b", 2)}, + { + "bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1), + "baz": ("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", 2), + }, {}, - "66c03ae00824e6be1283cc370969f6ea", + "4e67de4393d14c1e9c472438f0f1f8b1-2--3", ), ( {}, - {"bar": ("a", 1), "baz": ("b", 2)}, - "6969470da4b829f0a8b665ac78350abd", + { + "bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-1--1", 1), + "baz": ("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2", 2), + }, + "859ca1926affe9c7d0424030f26fbd89-2--3", ), ( {}, - {"baz": ("b", 1), "bar": ("a", 2)}, - "25f351bbdcfb33f7706f7ef1e80cb010", + { + "baz": ("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--1", 1), + "bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-1--2", 2), + }, + "8f8361a286c9a7c3fbfd464e33989037-2--3", ), ( - {"baz": ("a", 1)}, - {"bar": ("b", 2)}, - "a9540738019a48e6392c942217f7526d", + {"baz": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1)}, + {"bar": ("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2", 2)}, + "3cb139f47d3a3580388f41956c15f55e-2--3", ), ], ) diff --git a/dandischema/digests/zarr.py b/dandischema/digests/zarr.py index c3cebf7f..b4bf8af2 100644 --- a/dandischema/digests/zarr.py +++ b/dandischema/digests/zarr.py @@ -2,6 +2,7 @@ from functools import total_ordering import hashlib +import re from typing import Dict, List, Optional, Tuple import pydantic @@ -10,15 +11,28 @@ ENCODING_KWARGS = {"separators": (",", ":")} +def generate_directory_digest(md5: str, file_count: int, size: int): + """Generate a directory digest from its constituent parts""" + return f"{md5}-{file_count}--{size}" + + +def parse_directory_digest(digest: str): + """Parse a directory digest into its constituent parts""" + match = re.match("([0-9a-f]{32})-([0-9]+)--([0-9]+)", digest) + if match is None: + raise ValueError(f"Cannot parse directory digest {digest}") + return match.group(1), int(match.group(2)), int(match.group(3)) + + @total_ordering class ZarrChecksum(pydantic.BaseModel): """ A checksum for a single file/directory in a zarr file. - Every file and directory in a zarr archive has a name and a MD5 hash. + Every file and directory in a zarr archive has a name, digest, and size. """ - md5: str + digest: str name: str size: int @@ -85,19 +99,27 @@ class ZarrChecksumListing(pydantic.BaseModel): """ checksums: ZarrChecksums - md5: str + digest: str size: int class ZarrJSONChecksumSerializer: - def aggregate_checksum(self, checksums: ZarrChecksums) -> str: - """Generate an aggregated checksum for a list of ZarrChecksums.""" + def aggregate_digest(self, checksums: ZarrChecksums) -> str: + """Generate an aggregated digest for a list of ZarrChecksums.""" # Use the most compact separators possible # content = json.dumps([asdict(zarr_md5) for zarr_md5 in checksums], separators=(',', ':'))0 content = checksums.json(**ENCODING_KWARGS) h = hashlib.md5() h.update(content.encode("utf-8")) - return h.hexdigest() + md5 = h.hexdigest() + file_count = sum( + parse_directory_digest(checksum.digest)[1] + for checksum in checksums.directories + ) + len(checksums.files) + size = sum(file.size for file in checksums.files) + sum( + directory.size for directory in checksums.directories + ) + return generate_directory_digest(md5, file_count, size) def serialize(self, zarr_checksum_listing: ZarrChecksumListing) -> str: """Serialize a ZarrChecksumListing into a string.""" @@ -127,20 +149,18 @@ def generate_listing( files=sorted(files) if files is not None else [], directories=sorted(directories) if directories is not None else [], ) - size = sum(file.size for file in checksums.files) + sum( - directory.size for directory in checksums.directories - ) + digest = self.aggregate_digest(checksums) return ZarrChecksumListing( checksums=checksums, - md5=self.aggregate_checksum(checksums), - size=size, + digest=digest, + size=parse_directory_digest(digest)[2], ) # We do not store a checksum file for empty directories since an empty directory doesn't exist in # S3. However, an empty zarr file still needs to have a checksum, even if it has no checksum file. # For convenience, we define this constant as the "null" checksum. -EMPTY_CHECKSUM = ZarrJSONChecksumSerializer().generate_listing(ZarrChecksums()).md5 +EMPTY_CHECKSUM = ZarrJSONChecksumSerializer().generate_listing(ZarrChecksums()).digest def get_checksum( @@ -151,12 +171,12 @@ def get_checksum( raise ValueError("Cannot compute a Zarr checksum for an empty directory") checksum_listing = ZarrJSONChecksumSerializer().generate_listing( files=[ - ZarrChecksum(md5=md5, name=name, size=size) - for name, (md5, size) in files.items() + ZarrChecksum(digest=digest, name=name, size=size) + for name, (digest, size) in files.items() ], directories=[ - ZarrChecksum(md5=md5, name=name, size=size) - for name, (md5, size) in directories.items() + ZarrChecksum(digest=digest, name=name, size=size) + for name, (digest, size) in directories.items() ], ) - return checksum_listing.md5 + return checksum_listing.digest From 02972c5f698e702f37b545397d9eaf2ba554b28b Mon Sep 17 00:00:00 2001 From: Daniel Chiquito Date: Wed, 16 Mar 2022 10:48:49 -0400 Subject: [PATCH 5/5] Add new zarr checksum format to model --- dandischema/digests/zarr.py | 3 ++- dandischema/models.py | 9 ++++++++- dandischema/tests/test_models.py | 25 +++++++++++++++++-------- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/dandischema/digests/zarr.py b/dandischema/digests/zarr.py index b4bf8af2..d887d6d5 100644 --- a/dandischema/digests/zarr.py +++ b/dandischema/digests/zarr.py @@ -9,6 +9,7 @@ """Passed to the json() method of pydantic models for serialization.""" ENCODING_KWARGS = {"separators": (",", ":")} +ZARR_CHECKSUM_PATTERN = "([0-9a-f]{32})-([0-9]+)--([0-9]+)" def generate_directory_digest(md5: str, file_count: int, size: int): @@ -18,7 +19,7 @@ def generate_directory_digest(md5: str, file_count: int, size: int): def parse_directory_digest(digest: str): """Parse a directory digest into its constituent parts""" - match = re.match("([0-9a-f]{32})-([0-9]+)--([0-9]+)", digest) + match = re.match(ZARR_CHECKSUM_PATTERN, digest) if match is None: raise ValueError(f"Cannot parse directory digest {digest}") return match.group(1), int(match.group(2)), int(match.group(3)) diff --git a/dandischema/models.py b/dandischema/models.py index b791504c..0f16dd63 100644 --- a/dandischema/models.py +++ b/dandischema/models.py @@ -22,6 +22,7 @@ from .consts import DANDI_SCHEMA_VERSION from .digests.dandietag import DandiETag +from .digests.zarr import ZARR_CHECKSUM_PATTERN, parse_directory_digest from .model_types import ( AccessTypeDict, AgeReferenceTypeDict, @@ -1124,11 +1125,17 @@ def digest_check(cls, v, values, **kwargs): if v.get(DigestType.dandi_etag): raise ValueError("Digest cannot have both etag and zarr checksums.") digest = v.get(DigestType.dandi_zarr_checksum) - if not re.fullmatch(MD5_PATTERN, digest): + if not re.fullmatch(ZARR_CHECKSUM_PATTERN, digest): raise ValueError( f"Digest must have an appropriate dandi-zarr-checksum value. " f"Got {digest}" ) + _checksum, _file_count, zarr_size = parse_directory_digest(digest) + content_size = values.get("contentSize") + if content_size != zarr_size: + raise ValueError( + f"contentSize {content_size} is not equal to the checksum size {zarr_size}." + ) else: if DigestType.dandi_etag not in v: raise ValueError("A non-zarr asset must have a dandi-etag.") diff --git a/dandischema/tests/test_models.py b/dandischema/tests/test_models.py index 44f8ed8d..41aaf6a1 100644 --- a/dandischema/tests/test_models.py +++ b/dandischema/tests/test_models.py @@ -118,7 +118,7 @@ def test_asset_digest(): for el in exc.value.errors() ) - digest = 33 * "a" + digest = 32 * "a" digest_model = {models.DigestType.dandi_zarr_checksum: digest} with pytest.raises(pydantic.ValidationError) as exc: models.BareAsset( @@ -133,14 +133,23 @@ def test_asset_digest(): for val in set([el["msg"] for el in exc.value.errors()]) ] ) - digest = 32 * "a" + digest = f"{32 * 'a'}-1--42" digest_model = {models.DigestType.dandi_zarr_checksum: digest} - models.BareAsset( - contentSize=100, - encodingFormat="application/x-zarr", - digest=digest_model, - path="/", + with pytest.raises(pydantic.ValidationError) as exc: + models.BareAsset( + contentSize=100, + encodingFormat="application/x-zarr", + digest=digest_model, + path="/", + ) + assert any( + [ + "contentSize 100 is not equal to the checksum size 42." in val + for val in set([el["msg"] for el in exc.value.errors()]) + ] ) + digest = f"{32 * 'a'}-1--100" + digest_model = {models.DigestType.dandi_zarr_checksum: digest} with pytest.raises(pydantic.ValidationError) as exc: models.PublishedAsset( contentSize=100, @@ -357,7 +366,7 @@ def test_autogenerated_titles(): def test_dantimeta_1(): - """ checking basic metadata for publishing""" + """checking basic metadata for publishing""" # meta data without doi, datePublished and publishedBy meta_dict = { "identifier": "DANDI:999999",