Skip to content

Commit

Permalink
Update for change in Zarr checksum format
Browse files (browse the repository at this point in the history)
  • Loading branch information
jwodder committed Mar 2, 2022
1 parent a1a7fd5 commit ef00876
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 49 deletions.
8 changes: 3 additions & 5 deletions dandi/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -630,9 +630,7 @@ def get_digest(self) -> Digest:
it is a file, it will be MD5.
"""
if self.is_dir():
return Digest.dandi_zarr(
get_zarr_checksum(self.filepath, basepath=self.zarr_basepath)
)
return Digest.dandi_zarr(get_zarr_checksum(self.filepath))
else:
return Digest(
algorithm=DigestType.md5, value=get_digest(self.filepath, "md5")
Expand Down Expand Up @@ -695,11 +693,11 @@ def dirstat(dirpath: LocalZarrEntry) -> ZarrStat:
if p.is_dir():
st = dirstat(p)
size += st.size
dir_md5s[str(p)] = st.digest.value
dir_md5s[str(p)] = (st.digest.value, st.size)
files.extend(st.files)
else:
size += p.size
file_md5s[str(p)] = md5file_nocache(p.filepath)
file_md5s[str(p)] = (md5file_nocache(p.filepath), p.size)
files.append(p)
return ZarrStat(
size=size,
Expand Down
62 changes: 23 additions & 39 deletions dandi/support/digests.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from dataclasses import dataclass, field
import hashlib
import logging
import os.path
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union, cast

Expand Down Expand Up @@ -100,54 +101,32 @@ def get_dandietag(filepath: Union[str, Path]) -> DandiETag:
return DandiETag.from_file(filepath)


def get_zarr_checksum(
path: Path,
basepath: Optional[Path] = None,
known: Optional[Dict[str, str]] = None,
) -> str:
def get_zarr_checksum(path: Path, known: Optional[Dict[str, str]] = None) -> str:
"""
Compute the Zarr checksum for a file or directory tree. The checksum for a
subdirectory of a Zarr can be computed by setting ``path`` to the path to
the subdirectory and setting ``basepath`` to the path to the root of the
Zarr.
Compute the Zarr checksum for a file or directory tree.
If the digests for any files in the Zarr are already known, they can be
passed in the ``known`` argument, which must be a `dict` mapping
slash-separated paths relative to the root of the Zarr to hex digests.
"""
if path.is_file():
return cast(str, get_digest(path, "md5"))
root: Tuple[str, ...]
if basepath is None:
basepath = path
root = ()
else:
root = path.relative_to(basepath).parts
if known is None:
known = {}

def digest_file(f: Path) -> Tuple[Path, str]:
assert basepath is not None
assert known is not None
relpath = f.relative_to(basepath).as_posix()
relpath = f.relative_to(path).as_posix()
try:
dgst = known[relpath]
except KeyError:
dgst = md5file_nocache(f)
return (f, dgst)
return (f, dgst, os.path.getsize(f))

zcc = ZCDirectory(path="")
for p, digest in threaded_walk(path, digest_file):
zcc.add(p.relative_to(basepath), digest)
for d in root:
try:
sub = zcc.children[d]
except KeyError:
raise ValueError("Cannot compute a Zarr checksum for an empty directory")
else:
assert isinstance(sub, ZCDirectory)
zcc = sub
return zcc.get_digest()
for p, (digest, size) in threaded_walk(path, digest_file):
zcc.add(p.relative_to(path), digest, size)
return zcc.get_digest_size()[0]


@dataclass
Expand All @@ -159,8 +138,11 @@ class ZCFile:
:meta private:
"""

path: str
digest: str
size: int

def get_digest_size(self) -> Tuple[str, int]:
return (self.digest, self.size)


@dataclass
Expand All @@ -172,32 +154,34 @@ class ZCDirectory:
:meta private:
"""

path: str
children: Dict[str, Union[ZCDirectory, ZCFile]] = field(default_factory=dict)

def get_digest(self) -> str:
def get_digest_size(self) -> Tuple[str, int]:
size = 0
files = {}
dirs = {}
for n in self.children.values():
for name, n in self.children.items():
dgst, sz = n.get_digest_size()
if isinstance(n, ZCDirectory):
dirs[n.path] = n.get_digest()
dirs[name] = (dgst, sz)
else:
files[n.path] = n.digest
return cast(str, get_checksum(files, dirs))
files[name] = (dgst, sz)
size += sz
return (cast(str, get_checksum(files, dirs)), size)

def add(self, path: Path, digest: str) -> None:
def add(self, path: Path, digest: str, size: int) -> None:
*dirs, name = path.parts
parts = []
d = self
for dirname in dirs:
parts.append(dirname)
e = d.children.setdefault(dirname, ZCDirectory(path="/".join(parts)))
e = d.children.setdefault(dirname, ZCDirectory())
assert isinstance(e, ZCDirectory), f"Path type conflict for {d.path}"
d = e
parts.append(name)
pstr = "/".join(parts)
assert name not in d.children, f"File {pstr} encountered twice"
d.children[name] = ZCFile(path=pstr, digest=digest)
d.children[name] = ZCFile(digest=digest, size=size)


def md5file_nocache(filepath: Union[str, Path]) -> str:
Expand Down
6 changes: 2 additions & 4 deletions dandi/support/tests/test_digests.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,10 @@ def test_get_zarr_checksum(mocker: MockerFixture, tmp_path: Path) -> None:
get_zarr_checksum(tmp_path / "file1.txt") == "d0aa42f003e36c1ecaf9aa8f20b6f1ad"
)
assert get_zarr_checksum(tmp_path) == "e432031edb56d48fa9d9b205689db55e"
assert (
get_zarr_checksum(sub1, basepath=tmp_path) == "4cc960e6c5a46e4dae426124ec2f65c6"
)
assert get_zarr_checksum(sub1) == "4cc960e6c5a46e4dae426124ec2f65c6"

with pytest.raises(ValueError) as excinfo:
get_zarr_checksum(empty, basepath=tmp_path)
get_zarr_checksum(empty)
assert str(excinfo.value) == "Cannot compute a Zarr checksum for an empty directory"

spy = mocker.spy(digests, "md5file_nocache")
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ install_requires =
appdirs
click
click-didyoumean
dandischema ~= 0.5.1
dandischema ~= 0.6.0
etelemetry >= 0.2.2
fasteners
fscacher
Expand Down

0 comments on commit ef00876

Please sign in to comment.