Commit

refactor: simplify doi cleanup management commands
alee committed Oct 11, 2024
1 parent 8bbdf46 commit 66d042a
Showing 11 changed files with 448 additions and 379 deletions.
52 changes: 41 additions & 11 deletions django/core/pagination.py
@@ -12,9 +12,9 @@
SORT_BY_FILTERS = defaultdict(
    lambda: "Sort by: Relevance",  # default sort by relevance
    {
-        "-first_published_at": "Sort by: Publish date: newest",
-        "first_published_at": "Sort by: Publish date: oldest",
-        "-last_modified": "Sort by: Recently Modified",
+        "-first_published_at": "Sort by: Most Recent",
+        "first_published_at": "Sort by: Earliest",
+        "-last_modified": "Sort by: Last Modified",
    },
)
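The default factory means any sort key missing from this map displays as the relevance label; a quick standalone sketch of that behavior (the unknown key below is hypothetical):

from collections import defaultdict

SORT_BY_FILTERS = defaultdict(lambda: "Sort by: Relevance")
SORT_BY_FILTERS["-last_modified"] = "Sort by: Last Modified"

print(SORT_BY_FILTERS["-last_modified"])   # Sort by: Last Modified
print(SORT_BY_FILTERS["unknown_sort_key"]) # Sort by: Relevance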

@@ -31,11 +31,11 @@ def _to_search_terms(query_params):
    @staticmethod
    def _to_filter_display_terms(query_params):
        """
-        Convert query parameters into a list of displayable filter terms.
+        Convert query parameters into a list of displayable filter terms (replaces underscores with spaces, etc.)
        Args:
            query_params (QueryDict): The query parameters.
        Returns:
-            list: A list of display filter terms.
+            list: A list of displayable filter terms.
        """
        filters = []
        for key, values in query_params.lists():
@@ -47,16 +47,40 @@ def _to_filter_display_terms(query_params):
                filters.extend(values)
            elif key in ["published_before", "published_after"]:
                try:
-                    date = parser.isoparse(values[0]).date()
-                    filters.append(f"{key.replace('_', ' ')} {date.isoformat()}")
+                    publication_date = parser.isoparse(values[0]).date()
+                    filters.append(
+                        f"{key.replace('_', ' ')} {publication_date.isoformat()}"
+                    )
                except ValueError:
+                    # FIXME: this default behavior duplicates what we want to do in the else clause below
                    filters.extend(v.replace("_", " ") for v in values)
            else:
                filters.extend(v.replace("_", " ") for v in values)
        return filters
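As a rough sketch of what this conversion yields for a typical query string (the parameter names and output below are illustrative, not taken from the source):

from django.http import QueryDict

params = QueryDict("tags=agent_based&published_after=2024-01-01")
# _to_filter_display_terms(params) would produce something like:
# ["agent based", "published after 2024-01-01"]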

    @classmethod
-    def limit_page_range(cls, page=1, count=max_result_window, size=page_size):
+    def limit_page_range(cls, page=1, count=None, size=None):
+        """
+        Limits the page range based on the maximum result window and page size.
+        This method ensures that the page number and result count do not exceed
+        the configured maximum result window for Elasticsearch. It also clamps
+        the page number to a valid range.
+        Args:
+            page (int): The current page number. Defaults to 1.
+            count (int): The total number of results. Defaults to max_result_window.
+            size (int): The number of results per page. Defaults to page_size.
+        Returns:
+            tuple: A tuple containing:
+                - limited_count (int): The total result count, clamped to max_result_window.
+                - limited_page_number (int): The page number, clamped to the valid range.
+        """
+        if count is None:
+            count = cls.max_result_window
+        if size is None:
+            size = cls.page_size
        try:
            es_settings = getattr(settings, "WAGTAILSEARCH_BACKENDS", {})
            max_result_window = es_settings["default"]["INDEX_SETTINGS"]["settings"][
@@ -73,13 +97,19 @@ def limit_page_range(cls, page=1, count=max_result_window, size=page_size):
        # limit the result count to max_result_window
        limited_count = min(count, max_result_window)

-        # Clamp page to range [1, max_page_number]
+        # Clamp page to range [1, max_page]
        try:
-            max_page_number = -(-limited_count // size)
-            limited_page_number = min(max(1, int(page)), max_page_number)
+            max_page = (limited_count + size - 1) // size
+            requested_page = max(1, int(page))
+            limited_page_number = min(requested_page, max_page)
        except ValueError:
            limited_page_number = 1

+        logger.debug(
+            "Clamping count to %s and requested page to %s",
+            limited_count,
+            limited_page_number,
+        )
        return limited_count, limited_page_number

    def get_paginated_response(self, data):
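For a concrete feel of the clamping logic in limit_page_range, here is a minimal standalone sketch, assuming an Elasticsearch max_result_window of 10,000 and a page size of 10 (both values are assumptions here; the real method reads them from the Wagtail search backend settings):

MAX_RESULT_WINDOW = 10_000  # assumed Elasticsearch default
PAGE_SIZE = 10              # assumed default page size

def limit_page_range(page=1, count=None, size=None):
    count = MAX_RESULT_WINDOW if count is None else count
    size = PAGE_SIZE if size is None else size
    limited_count = min(count, MAX_RESULT_WINDOW)
    try:
        max_page = (limited_count + size - 1) // size  # ceiling division
        limited_page = min(max(1, int(page)), max_page)
    except ValueError:
        limited_page = 1
    return limited_count, limited_page

# 250,000 hits requested at page 99999: both get clamped
assert limit_page_range(page=99999, count=250_000) == (10_000, 1_000)
# a non-numeric page number falls back to page 1
assert limit_page_range(page="abc", count=100) == (100, 1)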
169 changes: 162 additions & 7 deletions django/library/doi.py
@@ -103,10 +103,7 @@ def get_welcome_message(dry_run: bool):
def doi_matches_pattern(doi: str) -> bool:
    # checks if DOI is formatted like this "00.12345/q2xt-rj46"
    pattern = re.compile(f"{DATACITE_PREFIX}/[-._;()/:a-zA-Z0-9]+")
-    if re.match(pattern, doi):
-        return True
-    else:
-        return False
+    return bool(re.match(pattern, doi))
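To illustrate the check (DATACITE_PREFIX is defined elsewhere in the module; the value below is a hypothetical registrant prefix):

import re

DATACITE_PREFIX = "10.12345"  # hypothetical; the real prefix comes from configuration

def doi_matches_pattern(doi: str) -> bool:
    pattern = re.compile(f"{DATACITE_PREFIX}/[-._;()/:a-zA-Z0-9]+")
    return bool(re.match(pattern, doi))

assert doi_matches_pattern("10.12345/q2xt-rj46")
assert not doi_matches_pattern("not-a-doi")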


class DataCiteApi:
@@ -122,8 +119,10 @@ class DataCiteApi:
    dry_run (bool): Flag indicating whether the operations should be performed in dry run mode.
    """

-    DATACITE_ERRORS_TO_STATUS_CODE = defaultdict(lambda: 500)
-    DATACITE_ERRORS_TO_STATUS_CODE.update(
+    # this is only needed right now because incoming DataCiteErrors do not keep track of the status code
+    # maps DataCite error classes to the HTTP status codes that caused them
+    DATACITE_ERRORS_TO_STATUS_CODE = defaultdict(
+        lambda: 500,
        {
            DataCiteNoContentError: 204,
            DataCiteBadRequestError: 400,
@@ -133,7 +132,7 @@ class DataCiteApi:
            DataCiteGoneError: 410,
            DataCitePreconditionError: 412,
            DataCiteServerError: 500,
-        }
+        },
    )

def __init__(self, dry_run=True):
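The defaultdict form behaves the same as the old construct-then-update form: known DataCite error classes map to the HTTP status codes that raised them, and anything else falls back to 500. A small hypothetical lookup helper (status_code_for is not part of the source):

def status_code_for(error: Exception) -> int:
    # unknown error types hit the default factory and return 500
    return DataCiteApi.DATACITE_ERRORS_TO_STATUS_CODE[type(error)]

# status_code_for(DataCiteGoneError()) -> 410
# status_code_for(RuntimeError("unexpected")) -> 500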
@@ -481,3 +480,159 @@ def _save_log_record(
message=message,
metadata_hash=metadata_hash,
)


def mint_dois_for_peer_reviewed_releases_without_dois(interactive=True, dry_run=True):
    """
    For ALL peer reviewed releases without DOIs:
    1. Mints a DOI for the parent codebase if codebase.doi doesn't exist.
    2. Mints a DOI for the release.
    3. Updates metadata for the parent codebase and sibling releases.
    """

print(get_welcome_message(dry_run))
datacite_api = DataCiteApi()

    # equivalent to:
    # CodebaseRelease.objects.filter(peer_reviewed=True).filter(Q(doi__isnull=True) | Q(doi=""))
peer_reviewed_releases_without_dois = (
CodebaseRelease.objects.reviewed().without_doi()
)

total_peer_reviewed_releases_without_dois = (
peer_reviewed_releases_without_dois.count()
)
logger.info(
"Minting DOIs for %s peer reviewed releases without DOIs",
total_peer_reviewed_releases_without_dois,
)

for i, release in enumerate(peer_reviewed_releases_without_dois):
logger.debug(
"Processing release %s/%s - %s",
i + 1,
total_peer_reviewed_releases_without_dois,
release.pk,
)
if interactive:
input("Press Enter to continue or CTRL+C to quit...")

codebase = release.codebase
codebase_doi = codebase.doi

"""
Mint DOI for codebase(parent) if it doesn't exist.
"""
if not codebase_doi:
# request to DataCite API
codebase_doi = datacite_api.mint_new_doi_for_codebase(codebase)

if not codebase_doi:
logger.error(
"Could not mint DOI for parent codebase %s. Skipping release %s.",
codebase.pk,
release.pk,
)
if interactive:
input("Press Enter to continue or CTRL+C to quit...")
continue

if not dry_run:
codebase.doi = codebase_doi
codebase.save()

"""
Mint DOI for release
"""
# request to DataCite API
release_doi = datacite_api.mint_new_doi_for_release(release)
if not release_doi:
logger.error("Could not mint DOI for release %s. Skipping.", release.pk)
if interactive:
input("Press Enter to continue or CTRL+C to quit...")
continue

if not dry_run:
release.doi = release_doi
release.save()

logger.debug("Updating metadata for parent codebase of release %s", release.pk)
"""
Since a new DOI has been minted for the release, we need to update it's parent's metadata (HasVersion)
"""
ok = datacite_api.update_metadata_for_codebase(codebase)
if not ok:
logger.error("Failed to update metadata for codebase %s", codebase.pk)

"""
Since a new DOI has been minted for the release, we need to update its siblings' metadata (isNewVersionOf, isPreviousVersionOf)
"""
logger.debug("Updating metadata for sibling releases of release %s", release.pk)

previous_release = release.get_previous_release()
next_release = release.get_next_release()

if previous_release and previous_release.doi:
ok = datacite_api.update_metadata_for_release(previous_release)
if not ok:
logger.error(
"Failed to update metadata for previous_release %s",
previous_release.pk,
)

if next_release and next_release.doi:
ok = datacite_api.update_metadata_for_release(next_release)
if not ok:
logger.error(
"Failed to update metadata for next_release %s", next_release.pk
)

    logger.info(
        "Processed %s peer reviewed releases without DOIs.",
        total_peer_reviewed_releases_without_dois,
    )

"""
assert correctness
"""
if not dry_run:
print(VERIFICATION_MESSAGE)
logger.info(
"Verifying: all peer reviewed releases without DOIs and their parent codebases have valid DOIs"
)
invalid_codebases = []
invalid_releases = []

for i, release in enumerate(peer_reviewed_releases_without_dois):
logger.debug(
"Verifying release: %s / %s",
i + 1,
total_peer_reviewed_releases_without_dois,
)

if not release.doi or not doi_matches_pattern(release.doi):
invalid_releases.append(release.pk)
if not release.codebase.doi or not doi_matches_pattern(
release.codebase.doi
):
invalid_codebases.append(release.codebase.pk)

        if invalid_codebases:
            logger.error(
                "Failure: %s codebases with invalid or missing DOIs: %s",
                len(invalid_codebases),
                invalid_codebases,
            )
else:
logger.info(
"Success. All parent codebases for peer reviewed releases previously without DOIs have valid DOIs now."
)
        if invalid_releases:
            logger.error(
                "Failure: %s CodebaseReleases with invalid or missing DOIs: %s",
                len(invalid_releases),
                invalid_releases,
            )
else:
logger.info(
"Success. All peer reviewed releases previously without DOIs have valid DOIs now."
)
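For context, the sibling metadata updates above translate into DataCite relatedIdentifiers entries along these lines (a sketch of the metadata shape only, with hypothetical DOIs; this is not the module's actual serialization code):

previous_doi, next_doi = "10.12345/abcd-0001", "10.12345/abcd-0003"

related_identifiers = [
    # the newly minted release points back at its predecessor...
    {
        "relationType": "IsNewVersionOf",
        "relatedIdentifier": previous_doi,
        "relatedIdentifierType": "DOI",
    },
    # ...and forward at its successor
    {
        "relationType": "IsPreviousVersionOf",
        "relatedIdentifier": next_doi,
        "relatedIdentifierType": "DOI",
    },
]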
64 changes: 64 additions & 0 deletions django/library/management/commands/clean_peer_reviewed_dois_02.py
@@ -0,0 +1,64 @@
import argparse
import logging
from django.core.management.base import BaseCommand
from library.doi import VERIFICATION_MESSAGE, get_welcome_message
from library.models import CodebaseRelease

logger = logging.getLogger(__name__)


def remove_dois_from_unreviewed_releases(interactive=True, dry_run=True):
print(get_welcome_message(dry_run))

unreviewed_releases_with_dois = CodebaseRelease.objects.filter(
peer_reviewed=False, doi__isnull=False
)
total_unreviewed_releases_with_dois = unreviewed_releases_with_dois.count()

logger.info(
"Cleaning up DOIs for %s unreviewed CodebaseReleases with DOIs",
total_unreviewed_releases_with_dois,
)
    if interactive:
        confirm = input(
            "Deleting all DOIs for unreviewed CodebaseReleases. Enter 'DELETE' to continue or CTRL+C to quit: "
        )
        if confirm.lower() != "delete":
            logger.info("Aborting.")
            return
    if not dry_run:
        unreviewed_releases_with_dois.update(doi=None)

"""
assert correctness
"""
if not dry_run:
print(VERIFICATION_MESSAGE)
        logger.info(
            "Checking that DOIs for all unreviewed releases have been deleted..."
        )
assert (
CodebaseRelease.objects.filter(
peer_reviewed=False, doi__isnull=False
).count()
== 0
)
        logger.info(
            "Successfully deleted DOIs from all %s unreviewed CodebaseReleases that had DOIs.",
            total_unreviewed_releases_with_dois,
        )


class Command(BaseCommand):

def add_arguments(self, parser):
        parser.add_argument(
            "--interactive",
            action=argparse.BooleanOptionalAction,
            default=True,
            help="Wait for user confirmation before deleting (disable with --no-interactive).",
        )
parser.add_argument(
"--dry-run", action="store_true", help="Output what would have happened."
)

def handle(self, *args, **options):
interactive = options["interactive"]
dry_run = options["dry_run"]
remove_dois_from_unreviewed_releases(interactive, dry_run)
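A quick sketch of driving this management command programmatically (the option values are illustrative):

from django.core.management import call_command

# preview only: skips the confirmation prompt and leaves all DOIs untouched
call_command("clean_peer_reviewed_dois_02", interactive=False, dry_run=True)

# real run, with the interactive 'DELETE' confirmation
call_command("clean_peer_reviewed_dois_02")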