From d5b9668829d90f13a3f0359d0afa3e28e86acc0a Mon Sep 17 00:00:00 2001
From: Jacob Nesbitt
Date: Mon, 20 Mar 2023 13:37:33 -0400
Subject: [PATCH] Add management command to re-extract for asset metadata

---
 .../management/commands/extract_metadata.py   | 110 ++++++++++++++++++
 setup.py                                      |   2 +
 2 files changed, 112 insertions(+)
 create mode 100644 dandiapi/api/management/commands/extract_metadata.py

diff --git a/dandiapi/api/management/commands/extract_metadata.py b/dandiapi/api/management/commands/extract_metadata.py
new file mode 100644
index 000000000..9d9ddbdf3
--- /dev/null
+++ b/dandiapi/api/management/commands/extract_metadata.py
@@ -0,0 +1,110 @@
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+
+from dandi.dandiapi import RemoteReadableAsset
+from dandi.metadata.nwb import nwb2asset
+from dandischema.models import get_schema_version
+from django.contrib.auth.models import User
+import djclick as click
+from tqdm import tqdm
+
+from dandiapi.api.models import Asset, Dandiset, Version
+from dandiapi.api.services.asset import change_asset
+
+logger = logging.getLogger(__name__)
+
+
+@click.group()
+def group():
+    """Re-extract asset metadata from the underlying NWB files."""
+
+
+def extract_asset_metadata(asset: Asset, draft_version: Version):
+    """
+    Re-extract the metadata of a single NWB asset within a draft version.
+
+    The asset's existing blob/zarr and the newly extracted metadata are handed to
+    ``change_asset``. Assets whose path does not end in ``.nwb`` are logged and skipped.
+    """
+    # Check the extension first so that skipped assets never need a remote handle
+    if not asset.path.lower().endswith('.nwb'):
+        logger.info('Asset %s: Not an NWB file, skipping...', asset.path)
+        return
+
+    readable_asset = RemoteReadableAsset(
+        asset.s3_url, size=asset.size, mtime=asset.modified, name=Path(asset.path).name
+    )
+    new_metadata = nwb2asset(readable_asset, schema_version=get_schema_version()).json_dict()
+
+    # Use dandiset owner, default to some admin user
+    user = (
+        draft_version.dandiset.owners.first()
+        or User.objects.filter(is_superuser=True, is_staff=True).first()
+    )
+
+    # Replace old asset with new asset containing updated metadata
+    change_asset(
+        user=user,
+        asset=asset,
+        version=draft_version,
+        new_asset_blob=asset.blob,
+        new_zarr_archive=asset.zarr,
+        new_metadata=new_metadata,
+    )
+
+
+def extract_dandiset_assets(dandiset: Dandiset):
+    """Re-extract the metadata of every unpublished NWB asset in a dandiset's draft version."""
+    # Only update assets which do not belong to a published version
+    assets = dandiset.draft_version.assets.filter(
+        published=False, path__iendswith='.nwb'
+    ).select_related('blob', 'zarr')
+    if not assets:
+        logger.info('No draft NWB assets found in dandiset %s. Skipping...', dandiset)
+        return
+
+    for asset in tqdm(assets):
+        extract_asset_metadata(asset=asset, draft_version=dandiset.draft_version)
+
+
+@group.command(help='Re-extracts the metadata of this asset')
+@click.argument('asset_id')
+def asset(asset_id: str):
+    # Report an unknown asset as a regular CLI error rather than a traceback
+    try:
+        asset = Asset.objects.get(asset_id=asset_id)
+    except Asset.DoesNotExist:
+        raise click.ClickException(f'No asset found with ID {asset_id!r}.') from None
+
+    draft_versions = asset.versions.filter(version='draft')
+    if not draft_versions.exists():
+        raise click.ClickException(
+            'Cannot re-extract metadata of asset that has no associated draft versions.'
+        )
+
+    # Re-extract for every draft version
+    for version in draft_versions:
+        extract_asset_metadata(asset=asset, draft_version=version)
+
+
+@group.command(
+    help='Re-extracts the metadata of all assets in the draft version of the provided dandiset'
+)
+@click.argument('dandiset_id')
+def dandiset(dandiset_id: str):
+    # Report a malformed or unknown identifier as a regular CLI error
+    try:
+        dandiset = Dandiset.objects.get(id=int(dandiset_id))
+    except (ValueError, Dandiset.DoesNotExist):
+        raise click.ClickException(f'No dandiset found with ID {dandiset_id!r}.') from None
+
+    extract_dandiset_assets(dandiset)
+
+
+@group.command(name='all', help='Re-extracts the metadata of all assets in all draft versions')
+def all_dandisets():
+    for dandiset in Dandiset.objects.all():
+        logger.info('DANDISET: %s', dandiset.identifier)
+        extract_dandiset_assets(dandiset)
diff --git a/setup.py b/setup.py
index e7db9cc0a..7c1023a96 100644
--- a/setup.py
+++ b/setup.py
@@ -40,6 +40,7 @@
     include_package_data=True,
     install_requires=[
         'celery',
+        'dandi',
         # Pin dandischema to exact version to make explicit which schema version is being used
         'dandischema==0.10.2',  # schema version 0.6.8
         'django~=4.1.0',
@@ -59,6 +60,7 @@
         'djangorestframework-yaml',
         'drf-extensions',
         'drf-yasg',
+        'fsspec[http]',
         'jsonschema',
         'boto3[s3]',
         'more_itertools',