From ad081b357b0de660dfec4595f37caf4389ddc106 Mon Sep 17 00:00:00 2001 From: Jacob Nesbitt Date: Thu, 18 Jan 2024 16:02:14 -0500 Subject: [PATCH 1/3] Don't use .save() in validate_version_metadata --- dandiapi/api/services/metadata/__init__.py | 41 +++++++++++----------- dandiapi/api/tests/test_tasks.py | 33 ++++++++++++++++- 2 files changed, 52 insertions(+), 22 deletions(-) diff --git a/dandiapi/api/services/metadata/__init__.py b/dandiapi/api/services/metadata/__init__.py index 022ee3379..bb4cbba2e 100644 --- a/dandiapi/api/services/metadata/__init__.py +++ b/dandiapi/api/services/metadata/__init__.py @@ -140,43 +140,42 @@ def _build_validatable_version_metadata(version: Version) -> dict: with transaction.atomic(): # validating version metadata needs to lock the version to avoid racing with # other modifications e.g. aggregate_assets_summary. - version = ( - Version.objects.filter(id=version.id, status=Version.Status.PENDING) - .select_for_update() - .first() - ) # It's possible for this version to get deleted during execution of this function. # If that happens *before* the select_for_update query, return early. - if version is None: + version_qs = Version.objects.filter(id=version.id).select_for_update() + if not version_qs.filter(status=Version.Status.PENDING).exists(): logger.info('Version %s no longer exists, skipping validation', version_id) return - version.status = Version.Status.VALIDATING - version.save() - + # Set to validating and continue + version_qs.update(status=Version.Status.VALIDATING) try: validate( - _build_validatable_version_metadata(version), + _build_validatable_version_metadata(version_qs.first()), schema_key='PublishedDandiset', json_validation=True, ) except dandischema.exceptions.ValidationError as e: logger.info('Error while validating version %s', version.id) - version.status = Version.Status.INVALID - - validation_errors = _collect_validation_errors(e) - version.validation_errors = validation_errors - version.save() + version_qs.update( + status=Version.Status.INVALID, + validation_errors=_collect_validation_errors(e), + modified=timezone.now(), + ) return except ValueError as e: # A bare ValueError is thrown when dandischema generates its own exceptions, like a # mismatched schemaVersion. - version.status = Version.Status.INVALID - version.validation_errors = [{'field': '', 'message': str(e)}] - version.save() + logger.info('Error while validating version %s', version.id) + version_qs.update( + status=Version.Status.INVALID, + validation_errors=[{'field': '', 'message': str(e)}], + modified=timezone.now(), + ) return + logger.info('Successfully validated version %s', version.id) - version.status = Version.Status.VALID - version.validation_errors = [] - version.save() + version_qs.update( + status=Version.Status.VALID, validation_errors=[], modified=timezone.now() + ) diff --git a/dandiapi/api/tests/test_tasks.py b/dandiapi/api/tests/test_tasks.py index 15cdea730..80ea1d39e 100644 --- a/dandiapi/api/tests/test_tasks.py +++ b/dandiapi/api/tests/test_tasks.py @@ -5,6 +5,8 @@ from typing import TYPE_CHECKING from django.conf import settings +from django.forms.models import model_to_dict +from django.utils import timezone from guardian.shortcuts import assign_perm import pytest @@ -190,13 +192,42 @@ def test_validate_asset_metadata_saves_version(draft_asset: Asset, draft_version def test_validate_version_metadata(draft_version: Version, asset: Asset): draft_version.assets.add(asset) - tasks.validate_version_metadata_task(draft_version.id) + # Bypass .save to manually set an older timestamp + old_modified = timezone.now() - datetime.timedelta(minutes=10) + updated = Version.objects.filter(id=draft_version.id).update(modified=old_modified) + assert updated == 1 + tasks.validate_version_metadata_task(draft_version.id) draft_version.refresh_from_db() assert draft_version.status == Version.Status.VALID assert draft_version.validation_errors == [] + # Ensure modified field was updated + assert draft_version.modified > old_modified + + +@pytest.mark.django_db() +def test_validate_version_metadata_no_side_effects(draft_version: Version, asset: Asset): + draft_version.assets.add(asset) + + # Set the version `status` and `validation_errors` fields to something + # which can't be a result of validate_version_metadata + draft_version.status = Version.Status.PENDING + draft_version.validation_errors = [{'foo': 'bar'}] + draft_version.save() + + # Validate version metadata, storing the model data before and after + old_data = model_to_dict(draft_version) + tasks.validate_version_metadata_task(draft_version.id) + draft_version.refresh_from_db() + new_data = model_to_dict(draft_version) + + # Check that change is isolated to these two fields + assert old_data.pop('status') != new_data.pop('status') + assert old_data.pop('validation_errors') != new_data.pop('validation_errors') + assert old_data == new_data + @pytest.mark.django_db() def test_validate_version_metadata_malformed_schema_version(draft_version: Version, asset: Asset): From 3b10c890d7725c65ba3bd7a2019ca4e9b1cabcce Mon Sep 17 00:00:00 2001 From: Jacob Nesbitt Date: Thu, 18 Jan 2024 16:08:00 -0500 Subject: [PATCH 2/3] Refactor version validation to consolidate updates --- dandiapi/api/services/metadata/__init__.py | 56 ++++++++++------------ 1 file changed, 24 insertions(+), 32 deletions(-) diff --git a/dandiapi/api/services/metadata/__init__.py b/dandiapi/api/services/metadata/__init__.py index bb4cbba2e..5049315bf 100644 --- a/dandiapi/api/services/metadata/__init__.py +++ b/dandiapi/api/services/metadata/__init__.py @@ -129,9 +129,28 @@ def _build_validatable_version_metadata(version: Version) -> dict: } return metadata_for_validation - version_id = version.id + def _get_version_validation_result( + version: Version, + ) -> tuple[Version.Status, list[dict[str, str]]]: + try: + validate( + _build_validatable_version_metadata(version), + schema_key='PublishedDandiset', + json_validation=True, + ) + except dandischema.exceptions.ValidationError as e: + logger.info('Error while validating version %s', version.id) + return (Version.Status.INVALID, _collect_validation_errors(e)) + except ValueError as e: + # A bare ValueError is thrown when dandischema generates its own exceptions, like a + # mismatched schemaVersion. + logger.info('Error while validating version %s', version.id) + return (Version.Status.INVALID, [{'field': '', 'message': str(e)}]) + + logger.info('Successfully validated version %s', version.id) + return (Version.Status.VALID, []) - logger.info('Validating dandiset metadata for version %s', version_id) + logger.info('Validating dandiset metadata for version %s', version.id) # Published versions are immutable if version.version != 'draft': @@ -145,37 +164,10 @@ def _build_validatable_version_metadata(version: Version) -> dict: # If that happens *before* the select_for_update query, return early. version_qs = Version.objects.filter(id=version.id).select_for_update() if not version_qs.filter(status=Version.Status.PENDING).exists(): - logger.info('Version %s no longer exists, skipping validation', version_id) + logger.info('Version %s no longer exists, skipping validation', version.id) return # Set to validating and continue version_qs.update(status=Version.Status.VALIDATING) - try: - validate( - _build_validatable_version_metadata(version_qs.first()), - schema_key='PublishedDandiset', - json_validation=True, - ) - except dandischema.exceptions.ValidationError as e: - logger.info('Error while validating version %s', version.id) - version_qs.update( - status=Version.Status.INVALID, - validation_errors=_collect_validation_errors(e), - modified=timezone.now(), - ) - return - except ValueError as e: - # A bare ValueError is thrown when dandischema generates its own exceptions, like a - # mismatched schemaVersion. - logger.info('Error while validating version %s', version.id) - version_qs.update( - status=Version.Status.INVALID, - validation_errors=[{'field': '', 'message': str(e)}], - modified=timezone.now(), - ) - return - - logger.info('Successfully validated version %s', version.id) - version_qs.update( - status=Version.Status.VALID, validation_errors=[], modified=timezone.now() - ) + status, errors = _get_version_validation_result(version_qs.first()) + version_qs.update(status=status, validation_errors=errors, modified=timezone.now()) From 11bfd89ab8b6bc0c6e334a687c1c0f2506b1242c Mon Sep 17 00:00:00 2001 From: Jacob Nesbitt Date: Thu, 18 Jan 2024 16:16:47 -0500 Subject: [PATCH 3/3] Separate behavior for deleted vs modified version --- dandiapi/api/services/metadata/__init__.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/dandiapi/api/services/metadata/__init__.py b/dandiapi/api/services/metadata/__init__.py index 5049315bf..f691b71b2 100644 --- a/dandiapi/api/services/metadata/__init__.py +++ b/dandiapi/api/services/metadata/__init__.py @@ -150,7 +150,8 @@ def _get_version_validation_result( logger.info('Successfully validated version %s', version.id) return (Version.Status.VALID, []) - logger.info('Validating dandiset metadata for version %s', version.id) + version_id = version.id + logger.info('Validating dandiset metadata for version %s', version_id) # Published versions are immutable if version.version != 'draft': @@ -159,15 +160,23 @@ def _get_version_validation_result( with transaction.atomic(): # validating version metadata needs to lock the version to avoid racing with # other modifications e.g. aggregate_assets_summary. + version_qs = Version.objects.filter(id=version_id).select_for_update() + current_version = version_qs.first() # It's possible for this version to get deleted during execution of this function. # If that happens *before* the select_for_update query, return early. - version_qs = Version.objects.filter(id=version.id).select_for_update() - if not version_qs.filter(status=Version.Status.PENDING).exists(): - logger.info('Version %s no longer exists, skipping validation', version.id) + if current_version is None: + logger.info('Version %s no longer exists, skipping validation', version_id) + return + + # If the version has since been modified, return early + if current_version.status != Version.Status.PENDING: + logger.info( + 'Skipping validation for version %s due to concurrent modification', version_id + ) return # Set to validating and continue version_qs.update(status=Version.Status.VALIDATING) - status, errors = _get_version_validation_result(version_qs.first()) + status, errors = _get_version_validation_result(current_version) version_qs.update(status=status, validation_errors=errors, modified=timezone.now())