Skip to content

Commit

Permalink
Merge pull request #933 from dandi/schema-120
Browse files Browse the repository at this point in the history
Update for change in Zarr checksum format
  • Loading branch information
yarikoptic authored Mar 22, 2022
2 parents f6d53c9 + 69f9326 commit c32e7f1
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 55 deletions.
2 changes: 1 addition & 1 deletion dandi/cli/tests/test_digest.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,4 @@ def test_digest_zarr():
)
r = runner.invoke(digest, ["--digest", "zarr-checksum", "sample.zarr"])
assert r.exit_code == 0
assert r.output == "sample.zarr: ebe3432f7ff77791877fa9eac0452831\n"
assert r.output == "sample.zarr: 4313ab36412db2981c3ed391b38604d6-5--1516\n"
8 changes: 3 additions & 5 deletions dandi/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,9 +636,7 @@ def get_digest(self) -> Digest:
it is a file, it will be MD5.
"""
if self.is_dir():
return Digest.dandi_zarr(
get_zarr_checksum(self.filepath, basepath=self.zarr_basepath)
)
return Digest.dandi_zarr(get_zarr_checksum(self.filepath))
else:
return Digest(
algorithm=DigestType.md5, value=get_digest(self.filepath, "md5")
Expand Down Expand Up @@ -701,11 +699,11 @@ def dirstat(dirpath: LocalZarrEntry) -> ZarrStat:
if p.is_dir():
st = dirstat(p)
size += st.size
dir_md5s[str(p)] = st.digest.value
dir_md5s[str(p)] = (st.digest.value, st.size)
files.extend(st.files)
else:
size += p.size
file_md5s[str(p)] = md5file_nocache(p.filepath)
file_md5s[str(p)] = (md5file_nocache(p.filepath), p.size)
files.append(p)
return ZarrStat(
size=size,
Expand Down
70 changes: 28 additions & 42 deletions dandi/support/digests.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from dataclasses import dataclass, field
import hashlib
import logging
import os.path
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union, cast

Expand Down Expand Up @@ -100,54 +101,32 @@ def get_dandietag(filepath: Union[str, Path]) -> DandiETag:
return DandiETag.from_file(filepath)


def get_zarr_checksum(
path: Path,
basepath: Optional[Path] = None,
known: Optional[Dict[str, str]] = None,
) -> str:
def get_zarr_checksum(path: Path, known: Optional[Dict[str, str]] = None) -> str:
"""
Compute the Zarr checksum for a file or directory tree. The checksum for a
subdirectory of a Zarr can be computed by setting ``path`` to the path to
the subdirectory and setting ``basepath`` to the path to the root of the
Zarr.
Compute the Zarr checksum for a file or directory tree.
If the digests for any files in the Zarr are already known, they can be
passed in the ``known`` argument, which must be a `dict` mapping
slash-separated paths relative to the root of the Zarr to hex digests.
"""
if path.is_file():
return cast(str, get_digest(path, "md5"))
root: Tuple[str, ...]
if basepath is None:
basepath = path
root = ()
else:
root = path.relative_to(basepath).parts
if known is None:
known = {}

def digest_file(f: Path) -> Tuple[Path, str]:
assert basepath is not None
def digest_file(f: Path) -> Tuple[Path, str, int]:
assert known is not None
relpath = f.relative_to(basepath).as_posix()
relpath = f.relative_to(path).as_posix()
try:
dgst = known[relpath]
except KeyError:
dgst = md5file_nocache(f)
return (f, dgst)
return (f, dgst, os.path.getsize(f))

zcc = ZCDirectory(path="")
for p, digest in threaded_walk(path, digest_file):
zcc.add(p.relative_to(basepath), digest)
for d in root:
try:
sub = zcc.children[d]
except KeyError:
raise ValueError("Cannot compute a Zarr checksum for an empty directory")
else:
assert isinstance(sub, ZCDirectory)
zcc = sub
return zcc.get_digest()
zcc = ZCDirectory()
for p, digest, size in threaded_walk(path, digest_file):
zcc.add(p.relative_to(path), digest, size)
return zcc.get_digest_size()[0]


@dataclass
Expand All @@ -159,8 +138,11 @@ class ZCFile:
:meta private:
"""

path: str
digest: str
size: int

def get_digest_size(self) -> Tuple[str, int]:
return (self.digest, self.size)


@dataclass
Expand All @@ -172,32 +154,36 @@ class ZCDirectory:
:meta private:
"""

path: str
children: Dict[str, Union[ZCDirectory, ZCFile]] = field(default_factory=dict)

def get_digest(self) -> str:
def get_digest_size(self) -> Tuple[str, int]:
size = 0
files = {}
dirs = {}
for n in self.children.values():
for name, n in self.children.items():
dgst, sz = n.get_digest_size()
if isinstance(n, ZCDirectory):
dirs[n.path] = n.get_digest()
dirs[name] = (dgst, sz)
else:
files[n.path] = n.digest
return cast(str, get_checksum(files, dirs))
files[name] = (dgst, sz)
size += sz
return (cast(str, get_checksum(files, dirs)), size)

def add(self, path: Path, digest: str) -> None:
def add(self, path: Path, digest: str, size: int) -> None:
*dirs, name = path.parts
parts = []
d = self
for dirname in dirs:
parts.append(dirname)
e = d.children.setdefault(dirname, ZCDirectory(path="/".join(parts)))
assert isinstance(e, ZCDirectory), f"Path type conflict for {d.path}"
e = d.children.setdefault(dirname, ZCDirectory())
assert isinstance(
e, ZCDirectory
), f"Path type conflict for {'/'.join(parts)}"
d = e
parts.append(name)
pstr = "/".join(parts)
assert name not in d.children, f"File {pstr} encountered twice"
d.children[name] = ZCFile(path=pstr, digest=digest)
d.children[name] = ZCFile(digest=digest, size=size)


def md5file_nocache(filepath: Union[str, Path]) -> str:
Expand Down
10 changes: 4 additions & 6 deletions dandi/support/tests/test_digests.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,13 +76,11 @@ def test_get_zarr_checksum(mocker: MockerFixture, tmp_path: Path) -> None:
assert (
get_zarr_checksum(tmp_path / "file1.txt") == "d0aa42f003e36c1ecaf9aa8f20b6f1ad"
)
assert get_zarr_checksum(tmp_path) == "e432031edb56d48fa9d9b205689db55e"
assert (
get_zarr_checksum(sub1, basepath=tmp_path) == "4cc960e6c5a46e4dae426124ec2f65c6"
)
assert get_zarr_checksum(tmp_path) == "25627e0fc7c609d10100d020f7782a25-8--197"
assert get_zarr_checksum(sub1) == "64af93ad7f8d471c00044d1ddbd4c0ba-4--97"

with pytest.raises(ValueError) as excinfo:
get_zarr_checksum(empty, basepath=tmp_path)
get_zarr_checksum(empty)
assert str(excinfo.value) == "Cannot compute a Zarr checksum for an empty directory"

spy = mocker.spy(digests, "md5file_nocache")
Expand All @@ -103,6 +101,6 @@ def test_get_zarr_checksum(mocker: MockerFixture, tmp_path: Path) -> None:
# ^^ Not used in calculation!
},
)
== "f67518b9092633027105f9aa9de9259f"
== "f77f4c5b277575f781c19ba91422f0c5-8--197"
)
spy.assert_called_once_with(sub2 / "file7.txt")
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ install_requires =
appdirs
click
click-didyoumean
dandischema ~= 0.5.1
dandischema ~= 0.6.0
etelemetry >= 0.2.2
fasteners
fscacher
Expand Down

0 comments on commit c32e7f1

Please sign in to comment.