diff --git a/dandischema/digests/zarr.py b/dandischema/digests/zarr.py index b4bf8af2..d887d6d5 100644 --- a/dandischema/digests/zarr.py +++ b/dandischema/digests/zarr.py @@ -9,6 +9,7 @@ """Passed to the json() method of pydantic models for serialization.""" ENCODING_KWARGS = {"separators": (",", ":")} +ZARR_CHECKSUM_PATTERN = "([0-9a-f]{32})-([0-9]+)--([0-9]+)" def generate_directory_digest(md5: str, file_count: int, size: int): @@ -18,7 +19,7 @@ def generate_directory_digest(md5: str, file_count: int, size: int): def parse_directory_digest(digest: str): """Parse a directory digest into its constituent parts""" - match = re.match("([0-9a-f]{32})-([0-9]+)--([0-9]+)", digest) + match = re.match(ZARR_CHECKSUM_PATTERN, digest) if match is None: raise ValueError(f"Cannot parse directory digest {digest}") return match.group(1), int(match.group(2)), int(match.group(3)) diff --git a/dandischema/models.py b/dandischema/models.py index b791504c..0f16dd63 100644 --- a/dandischema/models.py +++ b/dandischema/models.py @@ -22,6 +22,7 @@ from .consts import DANDI_SCHEMA_VERSION from .digests.dandietag import DandiETag +from .digests.zarr import ZARR_CHECKSUM_PATTERN, parse_directory_digest from .model_types import ( AccessTypeDict, AgeReferenceTypeDict, @@ -1124,11 +1125,17 @@ def digest_check(cls, v, values, **kwargs): if v.get(DigestType.dandi_etag): raise ValueError("Digest cannot have both etag and zarr checksums.") digest = v.get(DigestType.dandi_zarr_checksum) - if not re.fullmatch(MD5_PATTERN, digest): + if not re.fullmatch(ZARR_CHECKSUM_PATTERN, digest): raise ValueError( f"Digest must have an appropriate dandi-zarr-checksum value. " f"Got {digest}" ) + _checksum, _file_count, zarr_size = parse_directory_digest(digest) + content_size = values.get("contentSize") + if content_size != zarr_size: + raise ValueError( + f"contentSize {content_size} is not equal to the checksum size {zarr_size}." + ) else: if DigestType.dandi_etag not in v: raise ValueError("A non-zarr asset must have a dandi-etag.") diff --git a/dandischema/tests/test_models.py b/dandischema/tests/test_models.py index 44f8ed8d..41aaf6a1 100644 --- a/dandischema/tests/test_models.py +++ b/dandischema/tests/test_models.py @@ -118,7 +118,7 @@ def test_asset_digest(): for el in exc.value.errors() ) - digest = 33 * "a" + digest = 32 * "a" digest_model = {models.DigestType.dandi_zarr_checksum: digest} with pytest.raises(pydantic.ValidationError) as exc: models.BareAsset( @@ -133,14 +133,23 @@ def test_asset_digest(): for val in set([el["msg"] for el in exc.value.errors()]) ] ) - digest = 32 * "a" + digest = f"{32 * 'a'}-1--42" digest_model = {models.DigestType.dandi_zarr_checksum: digest} - models.BareAsset( - contentSize=100, - encodingFormat="application/x-zarr", - digest=digest_model, - path="/", + with pytest.raises(pydantic.ValidationError) as exc: + models.BareAsset( + contentSize=100, + encodingFormat="application/x-zarr", + digest=digest_model, + path="/", + ) + assert any( + [ + "contentSize 100 is not equal to the checksum size 42." in val + for val in set([el["msg"] for el in exc.value.errors()]) + ] ) + digest = f"{32 * 'a'}-1--100" + digest_model = {models.DigestType.dandi_zarr_checksum: digest} with pytest.raises(pydantic.ValidationError) as exc: models.PublishedAsset( contentSize=100, @@ -357,7 +366,7 @@ def test_autogenerated_titles(): def test_dantimeta_1(): - """ checking basic metadata for publishing""" + """checking basic metadata for publishing""" # meta data without doi, datePublished and publishedBy meta_dict = { "identifier": "DANDI:999999",