dandi · dchiquito · Mar 16, 2022 · Mar 2, 2022 · Mar 2, 2022 · Mar 2, 2022
diff --git a/dandischema/digests/tests/test_zarr.py b/dandischema/digests/tests/test_zarr.py
@@ -13,8 +13,8 @@
 
 def test_zarr_checksum_sort_order():
     # The a < b in the path should take precedence over z > y in the md5
-    a = ZarrChecksum(path="1/2/3/a/z", md5="z")
-    b = ZarrChecksum(path="1/2/3/b/z", md5="y")
+    a = ZarrChecksum(name="a", md5="z", size=1)
+    b = ZarrChecksum(name="b", md5="y", size=1)
     assert sorted([b, a]) == [a, b]
 
 
@@ -24,16 +24,16 @@ def test_zarr_checksum_sort_order():
 def test_zarr_checkums_is_empty():
     assert ZarrChecksums(directories=[], files=[]).is_empty
     assert not ZarrChecksums(
-        directories=[ZarrChecksum(md5="md5", path="path")], files=[]
+        directories=[ZarrChecksum(md5="md5", name="name", size=1)], files=[]
     ).is_empty
     assert not ZarrChecksums(
-        directories=[], files=[ZarrChecksum(md5="md5", path="path")]
+        directories=[], files=[ZarrChecksum(md5="md5", name="name", size=1)]
     ).is_empty
 
 
-a = ZarrChecksum(path="a", md5="a")
-b = ZarrChecksum(path="b", md5="b")
-c = ZarrChecksum(path="c", md5="c")
+a = ZarrChecksum(name="a", md5="a", size=1)
+b = ZarrChecksum(name="b", md5="b", size=1)
+c = ZarrChecksum(name="c", md5="c", size=1)
 
 
 @pytest.mark.parametrize(
@@ -113,35 +113,35 @@ def test_zarr_checkums_remove_checksums(
     [
         ([], [], "481a2f77ab786a0f45aafd5db0971caa"),
         (
-            [ZarrChecksum(path="foo/bar", md5="a")],
+            [ZarrChecksum(name="bar", md5="a", size=1)],
             [],
-            "cdcfdfca3622e20df03219273872549e",
+            "677dddd9af150be166c461acdef1b025",
         ),
         (
             [],
-            [ZarrChecksum(path="foo/bar", md5="a")],
-            "243aca82c6872222747183dd738b6fcb",
+            [ZarrChecksum(name="bar", md5="a", size=1)],
+            "aa776d184c64cbd6a5956ab0af012830",
         ),
         (
             [
-                ZarrChecksum(path="foo/bar", md5="a"),
-                ZarrChecksum(path="foo/baz", md5="b"),
+                ZarrChecksum(name="bar", md5="a", size=1),
+                ZarrChecksum(name="baz", md5="b", size=1),
             ],
             [],
-            "785295076ae9156b363e442ef6d485e0",
+            "c8a9b1dd53bb43ec6e5d379c29a1f1dd",
         ),
         (
             [],
             [
-                ZarrChecksum(path="foo/bar", md5="a"),
-                ZarrChecksum(path="foo/baz", md5="b"),
+                ZarrChecksum(name="bar", md5="a", size=1),
+                ZarrChecksum(name="baz", md5="b", size=1),
             ],
-            "ebca8bb8e716237e0f71657d1045930f",
+            "f45aa3833a2129628a38e421f74ff792",
         ),
         (
-            [ZarrChecksum(path="foo/baz", md5="a")],
-            [ZarrChecksum(path="foo/bar", md5="b")],
-            "9c34644ba03b7e9f58ebd1caef4215ad",
+            [ZarrChecksum(name="baz", md5="a", size=1)],
+            [ZarrChecksum(name="bar", md5="b", size=1)],
+            "bc0a0e85a0205eb3cb5f163f173774e5",
         ),
     ],
 )
@@ -160,11 +160,13 @@ def test_zarr_checksum_serializer_aggregate_checksum(
 def test_zarr_checksum_serializer_generate_listing():
     serializer = ZarrJSONChecksumSerializer()
     checksums = ZarrChecksums(
-        files=[ZarrChecksum(path="foo/bar", md5="a")],
-        directories=[ZarrChecksum(path="foo/baz", md5="b")],
+        files=[ZarrChecksum(name="bar", md5="a", size=1)],
+        directories=[ZarrChecksum(name="baz", md5="b", size=2)],
     )
     assert serializer.generate_listing(checksums) == ZarrChecksumListing(
-        checksums=checksums, md5="23076057c0da63f8ab50d0a108db332c"
+        checksums=checksums,
+        md5="c20479b1afe558a919eac450028a706e",
+        size=3,
     )
 
 
@@ -174,61 +176,63 @@ def test_zarr_serialize():
         serializer.serialize(
             ZarrChecksumListing(
                 checksums=ZarrChecksums(
-                    files=[ZarrChecksum(path="foo/bar", md5="a")],
-                    directories=[ZarrChecksum(path="bar/foo", md5="b")],
+                    files=[ZarrChecksum(name="bar", md5="a", size=1)],
+                    directories=[ZarrChecksum(name="foo", md5="b", size=2)],
                 ),
                 md5="c",
+                size=3,
             )
         )
-        == '{"checksums":{"directories":[{"md5":"b","path":"bar/foo"}],"files":[{"md5":"a","path":"foo/bar"}]},"md5":"c"}'  # noqa: E501
+        == '{"checksums":{"directories":[{"md5":"b","name":"foo","size":2}],"files":[{"md5":"a","name":"bar","size":1}]},"md5":"c","size":3}'  # noqa: E501
     )
 
 
 def test_zarr_deserialize():
     serializer = ZarrJSONChecksumSerializer()
     assert serializer.deserialize(
-        '{"checksums":{"directories":[{"md5":"b","path":"bar/foo"}],"files":[{"md5":"a","path":"foo/bar"}]},"md5":"c"}'  # noqa: E501
+        '{"checksums":{"directories":[{"md5":"b","name":"foo","size":2}],"files":[{"md5":"a","name":"bar","size":1}]},"md5":"c","size":3}'  # noqa: E501
     ) == ZarrChecksumListing(
         checksums=ZarrChecksums(
-            files=[ZarrChecksum(path="foo/bar", md5="a")],
-            directories=[ZarrChecksum(path="bar/foo", md5="b")],
+            files=[ZarrChecksum(name="bar", md5="a", size=1)],
+            directories=[ZarrChecksum(name="foo", md5="b", size=2)],
         ),
         md5="c",
+        size=3,
     )
 
 
 @pytest.mark.parametrize(
     "files,directories,checksum",
     [
         (
-            {"foo/bar": "a"},
+            {"bar": ("a", 1)},
             {},
-            "cdcfdfca3622e20df03219273872549e",
+            "677dddd9af150be166c461acdef1b025",
         ),
         (
             {},
-            {"foo/bar": "a"},
-            "243aca82c6872222747183dd738b6fcb",
+            {"bar": ("a", 1)},
+            "aa776d184c64cbd6a5956ab0af012830",
         ),
         (
-            {"foo/bar": "a", "foo/baz": "b"},
+            {"bar": ("a", 1), "baz": ("b", 2)},
             {},
-            "785295076ae9156b363e442ef6d485e0",
+            "66c03ae00824e6be1283cc370969f6ea",
         ),
         (
             {},
-            {"foo/bar": "a", "foo/baz": "b"},
-            "ebca8bb8e716237e0f71657d1045930f",
+            {"bar": ("a", 1), "baz": ("b", 2)},
+            "6969470da4b829f0a8b665ac78350abd",
         ),
         (
             {},
-            {"foo/baz": "b", "foo/bar": "a"},
-            "ebca8bb8e716237e0f71657d1045930f",
+            {"baz": ("b", 1), "bar": ("a", 2)},
+            "25f351bbdcfb33f7706f7ef1e80cb010",
         ),
         (
-            {"foo/baz": "a"},
-            {"foo/bar": "b"},
-            "9c34644ba03b7e9f58ebd1caef4215ad",
+            {"baz": ("a", 1)},
+            {"bar": ("b", 2)},
+            "a9540738019a48e6392c942217f7526d",
         ),
     ],
 )

diff --git a/dandischema/digests/zarr.py b/dandischema/digests/zarr.py
@@ -2,7 +2,7 @@
 
 from functools import total_ordering
 import hashlib
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple
 
 import pydantic
 
@@ -15,15 +15,16 @@ class ZarrChecksum(pydantic.BaseModel):
     """
     A checksum for a single file/directory in a zarr file.
 
-    Every file and directory in a zarr archive has a path and a MD5 hash.
+    Every file and directory in a zarr archive has a name, a size, and an MD5 hash.
     """
 
     md5: str
-    path: str
+    name: str
+    size: int
 
     # To make ZarrChecksums sortable
     def __lt__(self, other: ZarrChecksum):
-        return self.path < other.path
+        return self.name < other.name
 
 
 class ZarrChecksums(pydantic.BaseModel):
@@ -43,7 +44,7 @@ def is_empty(self):
     def _index(self, checksums: List[ZarrChecksum], checksum: ZarrChecksum):
         # O(n) performance, consider using the bisect module or an ordered dict for optimization
         for i in range(0, len(checksums)):
-            if checksums[i].path == checksum.path:
+            if checksums[i].name == checksum.name:
                 return i
         raise ValueError("Not found")
 
@@ -66,13 +67,13 @@ def add_directory_checksums(self, checksums: List[ZarrChecksum]):
                 self.directories.append(new_checksum)
         self.directories = sorted(self.directories)
 
-    def remove_checksums(self, paths: List[str]):
-        """Remove a list of paths from the listing."""
+    def remove_checksums(self, names: List[str]):
+        """Remove a list of names from the listing."""
         self.files = sorted(
-            filter(lambda checksum: checksum.path not in paths, self.files)
+            filter(lambda checksum: checksum.name not in names, self.files)
         )
         self.directories = sorted(
-            filter(lambda checksum: checksum.path not in paths, self.directories)
+            filter(lambda checksum: checksum.name not in names, self.directories)
         )
 
 
@@ -85,6 +86,7 @@ class ZarrChecksumListing(pydantic.BaseModel):
 
     checksums: ZarrChecksums
     md5: str
+    size: int
 
 
 class ZarrJSONChecksumSerializer:
@@ -125,9 +127,13 @@ def generate_listing(
                 files=sorted(files) if files is not None else [],
                 directories=sorted(directories) if directories is not None else [],
             )
+        size = sum(file.size for file in checksums.files) + sum(
+            directory.size for directory in checksums.directories
+        )
         return ZarrChecksumListing(
             checksums=checksums,
             md5=self.aggregate_checksum(checksums),
+            size=size,
         )
 
 
@@ -137,14 +143,20 @@ def generate_listing(
 EMPTY_CHECKSUM = ZarrJSONChecksumSerializer().generate_listing(ZarrChecksums()).md5
 
 
-def get_checksum(files: Dict[str, str], directories: Dict[str, str]) -> str:
+def get_checksum(
+    files: Dict[str, Tuple[str, int]], directories: Dict[str, Tuple[str, int]]
+) -> str:
     """Calculate the checksum of a directory."""
     if not files and not directories:
         raise ValueError("Cannot compute a Zarr checksum for an empty directory")
     checksum_listing = ZarrJSONChecksumSerializer().generate_listing(
-        files=[ZarrChecksum(md5=md5, path=path) for path, md5 in files.items()],
+        files=[
+            ZarrChecksum(md5=md5, name=name, size=size)
+            for name, (md5, size) in files.items()
+        ],
         directories=[
-            ZarrChecksum(md5=md5, path=path) for path, md5 in directories.items()
+            ZarrChecksum(md5=md5, name=name, size=size)
+            for name, (md5, size) in directories.items()
         ],
     )
     return checksum_listing.md5