
Merge pull request #1164 from dandi/ls
Tests for `ls` reinstated, underlying function fixed, support for ZARR-BIDS files added.
yarikoptic authored Dec 12, 2022
2 parents a0b0015 + 11c18f5 commit 923d7b0
Showing 7 changed files with 172 additions and 53 deletions.
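The new behavior can be exercised end to end the same way the added test below does, by running `ls -f yaml` on a ZARR-BIDS asset. A minimal sketch, assuming a local checkout of `bids-examples`:

```python
# Sketch only: mirrors the new test_ls_zarrbids_file; the bids-examples
# checkout location is an assumption, not part of this commit.
from click.testing import CliRunner

from dandi.cli.cmd_ls import ls

zarr_path = (
    "bids-examples/micr_SEMzarr/sub-01/ses-01/micr/"
    "sub-01_ses-01_sample-A_SPIM.ome.zarr"
)
result = CliRunner().invoke(ls, ["-f", "yaml", zarr_path])
assert result.exit_code == 0, result.output
print(result.stdout)  # one YAML record; the test expects identifier == "01"
```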
17 changes: 15 additions & 2 deletions dandi/cli/cmd_ls.py
@@ -5,6 +5,7 @@
import click

from .base import devel_option, lgr, map_to_click_exceptions
from ..consts import ZARR_EXTENSIONS, metadata_all_fields
from ..dandiarchive import DandisetURL, _dandi_url_parser, parse_dandi_url
from ..misctypes import Digest
from ..utils import is_url
@@ -92,7 +93,6 @@ def ls(
PYOUTFormatter,
YAMLFormatter,
)
from ..consts import metadata_all_fields

# TODO: avoid
from ..support.pyout import PYOUT_SHORT_NAMES_rev
@@ -358,7 +358,20 @@ def fn():
digest=Digest.dandi_etag(digest),
).json_dict()
else:
rec = get_metadata(path)
if path.endswith(tuple(ZARR_EXTENSIONS)):
if use_fake_digest:
digest = "0" * 32 + "-0--0"
else:
lgr.info("Calculating digest for %s", path)
digest = get_digest(path, digest="zarr-checksum")
rec = get_metadata(path, Digest.dandi_zarr(digest))
else:
if use_fake_digest:
digest = "0" * 32 + "-1"
else:
lgr.info("Calculating digest for %s", path)
digest = get_digest(path, digest="dandi-etag")
rec = get_metadata(path, Digest.dandi_etag(digest))
except Exception as exc:
_add_exc_error(path, rec, errors, exc)
if flatten:
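The `use_fake_digest` branch above substitutes placeholder digests whose shapes match the real ones: a zarr checksum is `<md5 hex>-<file count>--<total bytes>` (the format parsed by the regex added to `dandi/metadata.py` below), while a dandi-etag is `<md5 hex>-<part count>`. A minimal sketch of the corresponding `Digest` values:

```python
# Sketch of the placeholder digests built when use_fake_digest is set;
# the comments describe the value formats each constructor expects.
from dandi.misctypes import Digest

fake_zarr = Digest.dandi_zarr("0" * 32 + "-0--0")  # <md5>-<n files>--<total bytes>
fake_etag = Digest.dandi_etag("0" * 32 + "-1")     # <md5>-<n parts>
```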
25 changes: 23 additions & 2 deletions dandi/cli/tests/test_ls.py → dandi/cli/tests/test_cmd_ls.py
@@ -49,16 +49,37 @@ def load(s):
assert metadata[f] == simple1_nwb_metadata[f]


def test_ls_nwb_file(simple2_nwb):
bids_file_path = "simple2.nwb"
bids_file_path = os.path.join(simple2_nwb, bids_file_path)
r = CliRunner().invoke(ls, ["-f", "yaml", bids_file_path])
assert r.exit_code == 0, r.output
data = yaml_load(r.stdout, "safe")
assert len(data) == 1


@mark.skipif_no_network
@pytest.mark.xfail(reason="https://github.com/dandi/dandi-cli/issues/1097")
def test_ls_bids_file(bids_examples):
bids_file_path = "asl003/sub-Sub1/anat/sub-Sub1_T1w.nii.gz"
bids_file_path = os.path.join(bids_examples, bids_file_path)
r = CliRunner().invoke(ls, ["-f", "yaml", bids_file_path])
assert r.exit_code == 0, r.output
data = yaml_load(r.stdout, "safe")
assert len(data) == 1
assert data[0]["subject_id"] == "Sub1"
assert data[0]["identifier"] == "Sub1"


@mark.skipif_no_network
def test_ls_zarrbids_file(bids_examples):
bids_file_path = (
"micr_SEMzarr/sub-01/ses-01/micr/sub-01_ses-01_sample-A_SPIM.ome.zarr"
)
bids_file_path = os.path.join(bids_examples, bids_file_path)
r = CliRunner().invoke(ls, ["-f", "yaml", bids_file_path])
assert r.exit_code == 0, r.output
data = yaml_load(r.stdout, "safe")
assert len(data) == 1
assert data[0]["identifier"] == "01"


@mark.skipif_no_network
44 changes: 43 additions & 1 deletion dandi/files/bids.py
@@ -12,6 +12,7 @@

from .bases import GenericAsset, LocalFileAsset, NWBAsset
from .zarr import ZarrAsset
from ..consts import ZARR_MIME_TYPE
from ..metadata import add_common_metadata, prepare_metadata
from ..misctypes import Digest
from ..validate_types import ValidationResult
@@ -45,6 +46,12 @@ class BIDSDatasetDescriptionAsset(LocalFileAsset):
#: populated by `_validate()`
_asset_metadata: Optional[dict[str, dict[str, Any]]] = None

#: Version of BIDS used for the validation;
#: populated by `_validate()`
#: In future this might be removed and the information included in the
#: BareAsset via dandischema.
_bids_version: Optional[str] = None

#: Threading lock needed in case multiple assets are validated in parallel
#: during upload
_lock: Lock = field(init=False, default_factory=Lock, repr=False, compare=False)
@@ -65,6 +72,22 @@ def _validate(self) -> None:
bids_paths = [str(self.filepath)] + [
str(asset.filepath) for asset in self.dataset_files
]
# This is an ad-hoc fix which should be removed once bidsschematools greater than
# 0.6.0 is released.
# It won't cause any trouble afterwards, but it will no longer fulfill any
        # purpose. The issue is that README* is still required, and if we don't
        # include it explicitly in the listing, validation will implicitly fail
        # even if the file is present.
readme_extensions = ["", ".md", ".rst", ".txt"]
for ext in readme_extensions:
readme_candidate = self.bids_root / Path("README" + ext)
if (
readme_candidate.exists()
and str(readme_candidate) not in bids_paths
):
bids_paths += [str(readme_candidate)]
# end of ad-hoc fix.

results = validate_bids(*bids_paths)
self._dataset_errors: list[ValidationResult] = []
self._asset_errors: dict[str, list[ValidationResult]] = defaultdict(
@@ -74,7 +97,8 @@ def _validate(self) -> None:
for result in results:
if result.id in BIDS_ASSET_ERRORS:
assert result.path
self._asset_errors[str(result.path)].append(result)
bids_path = result.path.relative_to(self.bids_root).as_posix()
self._asset_errors[bids_path].append(result)
elif result.id in BIDS_DATASET_ERRORS:
self._dataset_errors.append(result)
elif result.id == "BIDS.MATCH":
@@ -84,6 +108,7 @@ def _validate(self) -> None:
self._asset_metadata[bids_path] = prepare_metadata(
result.metadata
)
self._bids_version = result.origin.bids_version

def get_asset_errors(self, asset: BIDSAsset) -> list[ValidationResult]:
""":meta private:"""
@@ -174,6 +199,11 @@ def get_metadata(
metadata["path"] = self.path
return BareAsset(**metadata)

def get_validation_bids_version(self) -> str:
self.bids_dataset_description._validate()
assert self.bids_dataset_description._bids_version is not None
return self.bids_dataset_description._bids_version


class NWBBIDSAsset(BIDSAsset, NWBAsset):
"""
@@ -219,6 +249,18 @@ def get_validation_errors(
self, schema_version, devel_debug
) + BIDSAsset.get_validation_errors(self)

def get_metadata(
self,
digest: Optional[Digest] = None,
ignore_errors: bool = True,
) -> BareAsset:
metadata = self.bids_dataset_description.get_asset_metadata(self)
start_time = end_time = datetime.now().astimezone()
add_common_metadata(metadata, self.filepath, start_time, end_time, digest)
metadata["path"] = self.path
metadata["encodingFormat"] = ZARR_MIME_TYPE
return BareAsset(**metadata)


class GenericBIDSAsset(BIDSAsset, GenericAsset):
"""
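A hedged sketch of how the new `ZarrBIDSAsset.get_metadata` and the `get_validation_bids_version` helper fit together (the dataset path and digest value below are made up; `dandi_file` is invoked the same way as in the updated `get_metadata` in `dandi/metadata.py` below):

```python
# Sketch only: the local dataset path is hypothetical.
from pathlib import Path

from dandi.files import dandi_file, find_bids_dataset_description
from dandi.misctypes import Digest

path = Path("bids-ds/sub-01/ses-01/micr/sub-01_ses-01_sample-A_SPIM.ome.zarr")
dd = find_bids_dataset_description(path)
assert dd is not None  # the asset must sit under a dataset_description.json
asset = dandi_file(path, None, bids_dataset_description=dd)

# Placeholder zarr digest in <md5>-<file count>--<total bytes> form
meta = asset.get_metadata(digest=Digest.dandi_zarr("0" * 32 + "-0--0"))
print(meta.encodingFormat)                  # the ZARR MIME type
print(asset.get_validation_bids_version())  # BIDS version reported by the validator
```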
135 changes: 88 additions & 47 deletions dandi/metadata.py
@@ -28,6 +28,7 @@
import tenacity

from . import __version__, get_logger
from .consts import ZARR_EXTENSIONS, metadata_all_fields
from .dandiset import Dandiset
from .misctypes import Digest
from .pynwb_utils import (
@@ -38,14 +39,22 @@
metadata_cache,
nwb_has_external_links,
)
from .utils import ensure_datetime, get_mime_type, get_utcnow_datetime
from .utils import (
ensure_datetime,
find_parent_directory_containing,
get_mime_type,
get_utcnow_datetime,
)

lgr = get_logger()


# Disable this for clean hacking
@metadata_cache.memoize_path
def get_metadata(path: Union[str, Path]) -> Optional[dict]:
def get_metadata(
path: Union[str, Path],
digest: Optional[Digest] = None,
) -> Optional[dict]:
"""
Get "flatdata" from a .nwb file or a Dandiset directory
@@ -59,64 +68,90 @@ def get_metadata(path: Union[str, Path]) -> Optional[dict]:
-------
dict
"""

from .files import bids, dandi_file, find_bids_dataset_description

# when we run in parallel, these annoying warnings appear
ignore_benign_pynwb_warnings()
path = str(path) # for Path
path = os.path.abspath(str(path)) # for Path
meta = dict()

if op.isdir(path):
if op.isdir(path) and not path.endswith(tuple(ZARR_EXTENSIONS)):
try:
dandiset = Dandiset(path)
return cast(dict, dandiset.metadata)
except ValueError as exc:
lgr.debug("Failed to get metadata for %s: %s", path, exc)
return None

if nwb_has_external_links(path):
raise NotImplementedError(
f"NWB files with external links are not supported: {path}"
# Is the data BIDS (as defined by the presence of a BIDS dataset descriptor)
bids_dataset_description = find_bids_dataset_description(path)
if bids_dataset_description:
dandiset_path = find_parent_directory_containing("dandiset.yaml", path)
bids_dataset_description = find_bids_dataset_description(path)
df = dandi_file(
Path(path),
dandiset_path,
bids_dataset_description=bids_dataset_description,
)
path_metadata = df.get_metadata(digest=digest)
assert isinstance(df, bids.BIDSAsset)
meta["bids_version"] = df.get_validation_bids_version()
# there might be a more elegant way to do this:
for key in metadata_all_fields:
try:
value = getattr(path_metadata.wasAttributedTo[0], key)
except AttributeError:
pass
else:
meta[key] = value
elif path.endswith((".NWB", ".nwb")):
if nwb_has_external_links(path):
raise NotImplementedError(
f"NWB files with external links are not supported: {path}"
)

# First read out possibly available versions of specifications for NWB(:N)
meta["nwb_version"] = get_nwb_version(path)

# PyNWB might fail to load because of missing extensions.
# There is a new initiative of establishing registry of such extensions.
# Not yet sure if PyNWB is going to provide "native" support for needed
# functionality: https://github.com/NeurodataWithoutBorders/pynwb/issues/1143
# So meanwhile, hard-coded workaround for data types we care about
ndtypes_registry = {
"AIBS_ecephys": "allensdk.brain_observatory.ecephys.nwb",
"ndx-labmetadata-abf": "ndx_dandi_icephys",
}
tried_imports = set()
while True:
try:
meta.update(_get_pynwb_metadata(path))
break
except KeyError as exc: # ATM there is
lgr.debug("Failed to read %s: %s", path, exc)
res = re.match(r"^['\"\\]+(\S+). not a namespace", str(exc))
if not res:
raise
ndtype = res.groups()[0]
if ndtype not in ndtypes_registry:
raise ValueError(
"We do not know which extension provides %s. "
"Original exception was: %s. " % (ndtype, exc)
)
import_mod = ndtypes_registry[ndtype]
lgr.debug("Importing %r which should provide %r", import_mod, ndtype)
if import_mod in tried_imports:
raise RuntimeError(
"We already tried importing %s to provide %s, but it seems it didn't help"
% (import_mod, ndtype)
)
tried_imports.add(import_mod)
__import__(import_mod)

meta["nd_types"] = get_neurodata_types(path)

# First read out possibly available versions of specifications for NWB(:N)
meta["nwb_version"] = get_nwb_version(path)

# PyNWB might fail to load because of missing extensions.
# There is a new initiative of establishing registry of such extensions.
# Not yet sure if PyNWB is going to provide "native" support for needed
# functionality: https://github.com/NeurodataWithoutBorders/pynwb/issues/1143
# So meanwhile, hard-coded workaround for data types we care about
ndtypes_registry = {
"AIBS_ecephys": "allensdk.brain_observatory.ecephys.nwb",
"ndx-labmetadata-abf": "ndx_dandi_icephys",
}
tried_imports = set()
while True:
try:
meta.update(_get_pynwb_metadata(path))
break
except KeyError as exc: # ATM there is
lgr.debug("Failed to read %s: %s", path, exc)
res = re.match(r"^['\"\\]+(\S+). not a namespace", str(exc))
if not res:
raise
ndtype = res.groups()[0]
if ndtype not in ndtypes_registry:
raise ValueError(
"We do not know which extension provides %s. "
"Original exception was: %s. " % (ndtype, exc)
)
import_mod = ndtypes_registry[ndtype]
lgr.debug("Importing %r which should provide %r", import_mod, ndtype)
if import_mod in tried_imports:
raise RuntimeError(
"We already tried importing %s to provide %s, but it seems it didn't help"
% (import_mod, ndtype)
)
tried_imports.add(import_mod)
__import__(import_mod)

meta["nd_types"] = get_neurodata_types(path)
else:
raise RuntimeError("Unable to get metadata from non-BIDS, non-NWB asset.")
return meta


@@ -949,6 +984,12 @@ def add_common_metadata(
"mtime %s of %s is in the future", metadata["blobDateModified"], path
)
metadata["contentSize"] = os.path.getsize(path)
if digest is not None and digest.algorithm is models.DigestType.dandi_zarr_checksum:
m = re.fullmatch(
r"(?P<hash>[0-9a-f]{32})-(?P<files>[0-9]+)--(?P<size>[0-9]+)", digest.value
)
if m:
metadata["contentSize"] = int(m["size"])
metadata.setdefault("wasGeneratedBy", []).append(
get_generator(start_time, end_time)
)
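The `contentSize` handling added above relies on the zarr checksum format `<md5 hex>-<file count>--<total bytes>`: when such a digest is passed in, the trailing size component is used instead of `os.path.getsize()`, which for a zarr directory would not reflect the size of its contents. A worked example with a made-up digest value:

```python
import re

# Hypothetical zarr checksum: md5 hex digest, 3 files, 123456 bytes in total
digest_value = "d41d8cd98f00b204e9800998ecf8427e-3--123456"

m = re.fullmatch(
    r"(?P<hash>[0-9a-f]{32})-(?P<files>[0-9]+)--(?P<size>[0-9]+)", digest_value
)
assert m is not None
print(int(m["size"]))  # 123456 -> stored as metadata["contentSize"]
```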
2 changes: 1 addition & 1 deletion dandi/tests/test_upload.py
@@ -220,7 +220,7 @@ def test_upload_bids(mocker: MockerFixture, bids_dandiset: SampleDandiset) -> None:
# Check existence of assets:
dandiset = bids_dandiset.dandiset
# file we created?
dandiset.get_asset_by_path("CHANGES")
dandiset.get_asset_by_path("README")
# BIDS descriptor file?
dandiset.get_asset_by_path("dataset_description.json")
# actual data file?
1 change: 1 addition & 0 deletions dandi/validate.py
@@ -68,6 +68,7 @@ def validate_bids(
origin = ValidationOrigin(
name="bidsschematools",
version=bidsschematools.__version__,
bids_version=validation_result["bids_version"],
)

# Storing variable to not re-compute set paths for each individual file.
Expand Down
1 change: 1 addition & 0 deletions dandi/validate_types.py
@@ -8,6 +8,7 @@
class ValidationOrigin:
name: str
version: str
bids_version: Optional[str] = None


class Severity(Enum):
Expand Down
