From 38efd136ecac2b4359cdff2045f5bb2ee0fe0dc5 Mon Sep 17 00:00:00 2001 From: Daniel Chiquito Date: Wed, 2 Mar 2022 16:46:47 -0500 Subject: [PATCH] Enhance zarr checksum digests --- dandischema/digests/tests/test_zarr.py | 181 +++++++++++++++++-------- dandischema/digests/zarr.py | 54 +++++--- 2 files changed, 161 insertions(+), 74 deletions(-) diff --git a/dandischema/digests/tests/test_zarr.py b/dandischema/digests/tests/test_zarr.py index a563706e..67f1e4b4 100644 --- a/dandischema/digests/tests/test_zarr.py +++ b/dandischema/digests/tests/test_zarr.py @@ -12,9 +12,9 @@ def test_zarr_checksum_sort_order(): - # The a < b in the path should take precedence over z > y in the md5 - a = ZarrChecksum(name="a", md5="z", size=1) - b = ZarrChecksum(name="b", md5="y", size=1) + # The a < b in the path should take precedence over z > y in the checksum + a = ZarrChecksum(name="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", digest="z", size=1) + b = ZarrChecksum(name="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", digest="y", size=1) assert sorted([b, a]) == [a, b] @@ -24,16 +24,24 @@ def test_zarr_checksum_sort_order(): def test_zarr_checkums_is_empty(): assert ZarrChecksums(directories=[], files=[]).is_empty assert not ZarrChecksums( - directories=[ZarrChecksum(md5="md5", name="name", size=1)], files=[] + directories=[ZarrChecksum(digest="checksum", name="name", size=1)], files=[] ).is_empty assert not ZarrChecksums( - directories=[], files=[ZarrChecksum(md5="md5", name="name", size=1)] + directories=[], files=[ZarrChecksum(digest="checksum", name="name", size=1)] ).is_empty -a = ZarrChecksum(name="a", md5="a", size=1) -b = ZarrChecksum(name="b", md5="b", size=1) -c = ZarrChecksum(name="c", md5="c", size=1) +a = ZarrChecksum( + name="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + size=1, +) +b = ZarrChecksum( + name="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", + digest="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", + size=1, +) +c = ZarrChecksum(name="c", digest="c", size=1) @pytest.mark.parametrize( @@ -84,12 +92,12 @@ def test_zarr_checkums_add_directory_checksums(initial, new_checksums, expected) ), [ ([], [], [], [], []), - ([a], [], ["a"], [], []), - ([], [a], ["a"], [], []), - ([a], [b], ["a"], [], [b]), - ([a], [b], ["b"], [a], []), - ([a, b, c], [], ["b"], [a, c], []), - ([], [a, b, c], ["b"], [], [a, c]), + ([a], [], ["aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"], [], []), + ([], [a], ["aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"], [], []), + ([a], [b], ["aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"], [], [b]), + ([a], [b], ["bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"], [a], []), + ([a, b, c], [], ["bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"], [a, c], []), + ([], [a, b, c], ["bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"], [], [a, c]), ], ) def test_zarr_checkums_remove_checksums( @@ -109,63 +117,93 @@ def test_zarr_checkums_remove_checksums( @pytest.mark.parametrize( - "file_checksums,directory_checksums,checksum", + "file_checksums,directory_checksums,digest", [ - ([], [], "481a2f77ab786a0f45aafd5db0971caa"), + ([], [], "481a2f77ab786a0f45aafd5db0971caa-0--0"), ( - [ZarrChecksum(name="bar", md5="a", size=1)], + [ + ZarrChecksum( + name="bar", digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", size=1 + ) + ], [], - "677dddd9af150be166c461acdef1b025", + "f21b9b4bf53d7ce1167bcfae76371e59-1--1", ), ( [], - [ZarrChecksum(name="bar", md5="a", size=1)], - "aa776d184c64cbd6a5956ab0af012830", + [ + ZarrChecksum( + name="bar", digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-1--1", size=1 + ) + ], + "ea8b8290b69b96422a3ed1cca0390f21-1--1", ), ( [ - ZarrChecksum(name="bar", md5="a", size=1), - ZarrChecksum(name="baz", md5="b", size=1), + ZarrChecksum( + name="bar", digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", size=1 + ), + ZarrChecksum( + name="baz", digest="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", size=1 + ), ], [], - "c8a9b1dd53bb43ec6e5d379c29a1f1dd", + "8e50add2b46d3a6389e2d9d0924227fb-2--2", ), ( [], [ - ZarrChecksum(name="bar", md5="a", size=1), - ZarrChecksum(name="baz", md5="b", size=1), + ZarrChecksum( + name="bar", digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-1--1", size=1 + ), + ZarrChecksum( + name="baz", digest="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--1", size=1 + ), ], - "f45aa3833a2129628a38e421f74ff792", + "4c21a113688f925240549b14136d61ff-2--2", ), ( - [ZarrChecksum(name="baz", md5="a", size=1)], - [ZarrChecksum(name="bar", md5="b", size=1)], - "bc0a0e85a0205eb3cb5f163f173774e5", + [ + ZarrChecksum( + name="baz", digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", size=1 + ) + ], + [ + ZarrChecksum( + name="bar", digest="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--1", size=1 + ) + ], + "d5e4eb5dc8efdb54ff089db1eef34119-2--2", ), ], ) -def test_zarr_checksum_serializer_aggregate_checksum( - file_checksums, directory_checksums, checksum +def test_zarr_checksum_serializer_aggregate_digest( + file_checksums, directory_checksums, digest ): serializer = ZarrJSONChecksumSerializer() assert ( - serializer.aggregate_checksum( + serializer.aggregate_digest( ZarrChecksums(files=file_checksums, directories=directory_checksums) ) - == checksum + == digest ) def test_zarr_checksum_serializer_generate_listing(): serializer = ZarrJSONChecksumSerializer() checksums = ZarrChecksums( - files=[ZarrChecksum(name="bar", md5="a", size=1)], - directories=[ZarrChecksum(name="baz", md5="b", size=2)], + files=[ + ZarrChecksum(name="bar", digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", size=1) + ], + directories=[ + ZarrChecksum( + name="baz", digest="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2", size=2 + ) + ], ) assert serializer.generate_listing(checksums) == ZarrChecksumListing( checksums=checksums, - md5="c20479b1afe558a919eac450028a706e", + digest="baf791d7bac84947c14739b1684ec5ab-2--3", size=3, ) @@ -176,27 +214,47 @@ def test_zarr_serialize(): serializer.serialize( ZarrChecksumListing( checksums=ZarrChecksums( - files=[ZarrChecksum(name="bar", md5="a", size=1)], - directories=[ZarrChecksum(name="foo", md5="b", size=2)], + files=[ + ZarrChecksum( + name="bar", + digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + size=1, + ) + ], + directories=[ + ZarrChecksum( + name="foo", + digest="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2", + size=2, + ) + ], ), - md5="c", + digest="cccccccccccccccccccccccccccccccc-2--3", size=3, ) ) - == '{"checksums":{"directories":[{"md5":"b","name":"foo","size":2}],"files":[{"md5":"a","name":"bar","size":1}]},"md5":"c","size":3}' # noqa: E501 + == '{"checksums":{"directories":[{"digest":"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2","name":"foo","size":2}],"files":[{"digest":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","name":"bar","size":1}]},"digest":"cccccccccccccccccccccccccccccccc-2--3","size":3}' # noqa: E501 ) def test_zarr_deserialize(): serializer = ZarrJSONChecksumSerializer() assert serializer.deserialize( - '{"checksums":{"directories":[{"md5":"b","name":"foo","size":2}],"files":[{"md5":"a","name":"bar","size":1}]},"md5":"c","size":3}' # noqa: E501 + '{"checksums":{"directories":[{"digest":"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2","name":"foo","size":2}],"files":[{"digest":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","name":"bar","size":1}]},"digest":"cccccccccccccccccccccccccccccccc-2--3","size":3}' # noqa: E501 ) == ZarrChecksumListing( checksums=ZarrChecksums( - files=[ZarrChecksum(name="bar", md5="a", size=1)], - directories=[ZarrChecksum(name="foo", md5="b", size=2)], + files=[ + ZarrChecksum( + name="bar", digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", size=1 + ) + ], + directories=[ + ZarrChecksum( + name="foo", digest="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2", size=2 + ) + ], ), - md5="c", + digest="cccccccccccccccccccccccccccccccc-2--3", size=3, ) @@ -205,34 +263,43 @@ def test_zarr_deserialize(): "files,directories,checksum", [ ( - {"bar": ("a", 1)}, + {"bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1)}, {}, - "677dddd9af150be166c461acdef1b025", + "f21b9b4bf53d7ce1167bcfae76371e59-1--1", ), ( {}, - {"bar": ("a", 1)}, - "aa776d184c64cbd6a5956ab0af012830", + {"bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-1--1", 1)}, + "ea8b8290b69b96422a3ed1cca0390f21-1--1", ), ( - {"bar": ("a", 1), "baz": ("b", 2)}, + { + "bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1), + "baz": ("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", 2), + }, {}, - "66c03ae00824e6be1283cc370969f6ea", + "4e67de4393d14c1e9c472438f0f1f8b1-2--3", ), ( {}, - {"bar": ("a", 1), "baz": ("b", 2)}, - "6969470da4b829f0a8b665ac78350abd", + { + "bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-1--1", 1), + "baz": ("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2", 2), + }, + "859ca1926affe9c7d0424030f26fbd89-2--3", ), ( {}, - {"baz": ("b", 1), "bar": ("a", 2)}, - "25f351bbdcfb33f7706f7ef1e80cb010", + { + "baz": ("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--1", 1), + "bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-1--2", 2), + }, + "8f8361a286c9a7c3fbfd464e33989037-2--3", ), ( - {"baz": ("a", 1)}, - {"bar": ("b", 2)}, - "a9540738019a48e6392c942217f7526d", + {"baz": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1)}, + {"bar": ("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2", 2)}, + "3cb139f47d3a3580388f41956c15f55e-2--3", ), ], ) diff --git a/dandischema/digests/zarr.py b/dandischema/digests/zarr.py index c3cebf7f..b4bf8af2 100644 --- a/dandischema/digests/zarr.py +++ b/dandischema/digests/zarr.py @@ -2,6 +2,7 @@ from functools import total_ordering import hashlib +import re from typing import Dict, List, Optional, Tuple import pydantic @@ -10,15 +11,28 @@ ENCODING_KWARGS = {"separators": (",", ":")} +def generate_directory_digest(md5: str, file_count: int, size: int): + """Generate a directory digest from its constituent parts""" + return f"{md5}-{file_count}--{size}" + + +def parse_directory_digest(digest: str): + """Parse a directory digest into its constituent parts""" + match = re.match("([0-9a-f]{32})-([0-9]+)--([0-9]+)", digest) + if match is None: + raise ValueError(f"Cannot parse directory digest {digest}") + return match.group(1), int(match.group(2)), int(match.group(3)) + + @total_ordering class ZarrChecksum(pydantic.BaseModel): """ A checksum for a single file/directory in a zarr file. - Every file and directory in a zarr archive has a name and a MD5 hash. + Every file and directory in a zarr archive has a name, digest, and size. """ - md5: str + digest: str name: str size: int @@ -85,19 +99,27 @@ class ZarrChecksumListing(pydantic.BaseModel): """ checksums: ZarrChecksums - md5: str + digest: str size: int class ZarrJSONChecksumSerializer: - def aggregate_checksum(self, checksums: ZarrChecksums) -> str: - """Generate an aggregated checksum for a list of ZarrChecksums.""" + def aggregate_digest(self, checksums: ZarrChecksums) -> str: + """Generate an aggregated digest for a list of ZarrChecksums.""" # Use the most compact separators possible # content = json.dumps([asdict(zarr_md5) for zarr_md5 in checksums], separators=(',', ':'))0 content = checksums.json(**ENCODING_KWARGS) h = hashlib.md5() h.update(content.encode("utf-8")) - return h.hexdigest() + md5 = h.hexdigest() + file_count = sum( + parse_directory_digest(checksum.digest)[1] + for checksum in checksums.directories + ) + len(checksums.files) + size = sum(file.size for file in checksums.files) + sum( + directory.size for directory in checksums.directories + ) + return generate_directory_digest(md5, file_count, size) def serialize(self, zarr_checksum_listing: ZarrChecksumListing) -> str: """Serialize a ZarrChecksumListing into a string.""" @@ -127,20 +149,18 @@ def generate_listing( files=sorted(files) if files is not None else [], directories=sorted(directories) if directories is not None else [], ) - size = sum(file.size for file in checksums.files) + sum( - directory.size for directory in checksums.directories - ) + digest = self.aggregate_digest(checksums) return ZarrChecksumListing( checksums=checksums, - md5=self.aggregate_checksum(checksums), - size=size, + digest=digest, + size=parse_directory_digest(digest)[2], ) # We do not store a checksum file for empty directories since an empty directory doesn't exist in # S3. However, an empty zarr file still needs to have a checksum, even if it has no checksum file. # For convenience, we define this constant as the "null" checksum. -EMPTY_CHECKSUM = ZarrJSONChecksumSerializer().generate_listing(ZarrChecksums()).md5 +EMPTY_CHECKSUM = ZarrJSONChecksumSerializer().generate_listing(ZarrChecksums()).digest def get_checksum( @@ -151,12 +171,12 @@ def get_checksum( raise ValueError("Cannot compute a Zarr checksum for an empty directory") checksum_listing = ZarrJSONChecksumSerializer().generate_listing( files=[ - ZarrChecksum(md5=md5, name=name, size=size) - for name, (md5, size) in files.items() + ZarrChecksum(digest=digest, name=name, size=size) + for name, (digest, size) in files.items() ], directories=[ - ZarrChecksum(md5=md5, name=name, size=size) - for name, (md5, size) in directories.items() + ZarrChecksum(digest=digest, name=name, size=size) + for name, (digest, size) in directories.items() ], ) - return checksum_listing.md5 + return checksum_listing.digest