Skip to content

Commit

Permalink
Add management command to re-extract for asset metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
jjnesbitt committed Sep 24, 2024
1 parent 99e55f4 commit 5c04aa2
Show file tree
Hide file tree
Showing 4 changed files with 147 additions and 0 deletions.
92 changes: 92 additions & 0 deletions dandiapi/api/management/commands/extract_metadata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
from __future__ import annotations

import logging
from pathlib import Path

from dandi.dandiapi import RemoteReadableAsset
from dandi.metadata.nwb import nwb2asset
from django.contrib.auth.models import User
import djclick as click
from tqdm import tqdm

from dandiapi.api.models import Asset, Dandiset, Version
from dandiapi.api.services.asset import change_asset

logger = logging.getLogger(__name__)


@click.group()
def group():
pass


def extract_asset_metadata(asset: Asset, draft_version: Version):
readable_asset = RemoteReadableAsset(
asset.s3_url, size=asset.size, mtime=asset.modified, name=Path(asset.path).name
)

if not asset.path.lower().endswith('.nwb'):
logger.info('Asset %s: Not an NWB file, skipping...', asset.path)
return

new_metadata = nwb2asset(readable_asset).json_dict()

# Use dandiset owner, default to some admin user
user = (
draft_version.dandiset.owners.first()
or User.objects.filter(is_superuser=True, is_staff=True).first()
)

# Replace old asset with new asset containing updated metadata
change_asset(
user=user,
asset=asset,
version=draft_version,
new_asset_blob=asset.blob,
new_zarr_archive=asset.zarr,
new_metadata=new_metadata,
)


def extract_dandiset_assets(dandiset: Dandiset):
# Only update assets which do not belong to a published version
assets = dandiset.draft_version.assets.filter(
published=False, path__iendswith='.nwb'
).select_related('blob', 'zarr')
if not assets:
logger.info('No draft NWB assets found in dandiset %s. Skipping...', dandiset)
return

for asset in tqdm(assets):
extract_asset_metadata(asset=asset, draft_version=dandiset.draft_version)


@group.command(help='Re-extracts the metadata of this asset')
@click.argument('asset_id')
def asset(asset_id: str):
asset = Asset.objects.get(asset_id=asset_id)
draft_versions = asset.versions.filter(version='draft')
if not draft_versions.exists():
raise click.ClickException(
'Cannot re-extract metadata of asset that has no associated draft versions.'
)

# Re-extract for every draft version
for version in draft_versions:
extract_asset_metadata(asset=asset, draft_version=version)


@group.command(
help='Re-extracts the metadata of all assets in the draft version of the provided dandiset'
)
@click.argument('dandiset_id')
def dandiset(dandiset_id: str):
dandiset = Dandiset.objects.get(id=int(dandiset_id))
extract_dandiset_assets(dandiset)


@group.command(name='all', help='Re-extracts the metadata of all assets in all draft versions')
def all_dandisets():
for dandiset in Dandiset.objects.all():
logger.info('DANDISET: %s', dandiset.identifier)
extract_dandiset_assets(dandiset)
32 changes: 32 additions & 0 deletions dandiapi/api/services/metadata/__init__.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,20 @@
from __future__ import annotations

from pathlib import Path
from typing import TYPE_CHECKING

from celery.utils.log import get_task_logger
from dandi.dandiapi import RemoteReadableAsset
from dandi.metadata.nwb import nwb2asset
import dandischema.exceptions
from dandischema.metadata import aggregate_assets_summary, validate
from django.conf import settings
from django.contrib.auth.models import User
from django.db import transaction
from django.utils import timezone

from dandiapi.api.models import Asset, Version
from dandiapi.api.services.asset import change_asset
from dandiapi.api.services.metadata.exceptions import (
AssetHasBeenPublishedError,
VersionHasBeenPublishedError,
Expand Down Expand Up @@ -180,3 +185,30 @@ def _get_version_validation_result(
version_qs.update(status=Version.Status.VALIDATING)
status, errors = _get_version_validation_result(current_version)
version_qs.update(status=status, validation_errors=errors, modified=timezone.now())


def re_extract_asset_metadata(asset: Asset):
readable_asset = RemoteReadableAsset(
asset.s3_url, size=asset.size, mtime=asset.modified, name=Path(asset.path).name
)

if not (asset.path.endswith('.nwb') or asset.path.endswith('.NWB')):
logger.info('Asset %s: Not an NWB file, skipping...', asset.path)
return

new_metadata = nwb2asset(readable_asset).json_dict()

# Use dandiset owner, default to some admin user
draft_version = asset.versions.get(version='draft')
user = draft_version.dandiset.owners.first()
if user is None:
user = User.objects.filter(is_superuser=True, is_staff=True).first()

change_asset(
user=user,
asset=asset,
version=draft_version,
new_asset_blob=asset.blob,
new_zarr_archive=asset.zarr,
new_metadata=new_metadata,
)
21 changes: 21 additions & 0 deletions dandiapi/api/tasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from celery import shared_task
from celery.exceptions import SoftTimeLimitExceeded
from celery.utils.log import get_task_logger
from django.conf import settings
from django.contrib.auth.models import User

from dandiapi.api.doi import delete_doi
Expand All @@ -22,6 +23,7 @@
if TYPE_CHECKING:
from uuid import UUID


logger = get_task_logger(__name__)


Expand Down Expand Up @@ -104,3 +106,22 @@ def unembargo_dandiset_task(dandiset_id: int, user_id: int):
except Exception:
send_dandiset_unembargo_failed_message(ds)
raise


@shared_task
def dispatch_assets_to_re_extract():
assets_to_migrate = Asset.objects.prefetch_related('versions').filter(
metadata__schemaVersion__lt=settings.DANDI_SCHEMA_VERSION,
versions__version='draft',
)

for asset in assets_to_migrate:
re_extract_asset_metadata_task.delay(asset_id=asset.id)


@shared_task
def re_extract_asset_metadata_task(asset_id: int):
from dandiapi.api.services.metadata import re_extract_asset_metadata

asset = Asset.objects.get(id=asset_id)
re_extract_asset_metadata(asset=asset)
2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
include_package_data=True,
install_requires=[
'celery',
'dandi',
# Pin dandischema to exact version to make explicit which schema version is being used
'dandischema==0.10.2', # schema version 0.6.8
'django~=4.1.0',
Expand All @@ -59,6 +60,7 @@
'djangorestframework-yaml',
'drf-extensions',
'drf-yasg',
'fsspec[http]',
'jsonschema',
'boto3[s3]',
'more_itertools',
Expand Down

0 comments on commit 5c04aa2

Please sign in to comment.