refactor: prefix doi management commands
- prefix all one-off destructive DOI commands with `doi_`
- add reset_staging to mint new DOIs on staging using the datacite
  sandbox, doi_reset_staging -> step 3, doi_mint_parent_codebase_dois
- bump deps for datacite schema 4.5 and django cve
alee committed Nov 27, 2024
1 parent 9c2678f commit 543030f
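
The renamed one-off commands are ordinary Django management commands; a minimal sketch of invoking them programmatically, assuming they take no required arguments (the command names come from the commit message above; everything else is illustrative, not part of this commit):

# Hedged sketch: command names are taken from the commit message; any options
# they accept, and whether they must run in this exact order, are assumptions.
from django.core.management import call_command

call_command("doi_reset_staging")              # re-mint staging DOIs against the DataCite sandbox
call_command("doi_mint_parent_codebase_dois")  # "step 3" per the commit message
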
Showing 17 changed files with 778 additions and 508 deletions.
177 changes: 98 additions & 79 deletions django/library/doi.py
@@ -1,9 +1,9 @@
import logging
import re
import time
import threading
import queue
import re
import requests
import threading
import time

from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
@@ -18,7 +18,7 @@
DataCiteRegistrationLog,
)

from datacite import DataCiteRESTClient, schema43
from datacite import DataCiteRESTClient, schema45
from datacite.errors import (
DataCiteError,
DataCiteNoContentError,
@@ -37,7 +37,7 @@
IS_STAGING = settings.DEPLOY_ENVIRONMENT.is_staging
IS_PRODUCTION = settings.DEPLOY_ENVIRONMENT.is_production

# prefix is different for (dev & staging) and production environments
# prefix differs across (dev + staging) and production
DATACITE_PREFIX = settings.DATACITE_PREFIX

MAX_DATACITE_API_WORKERS = 25
@@ -183,8 +183,12 @@ def _datacite_heartbeat_url(self):

def _validate_metadata(self, datacite_metadata: DataCiteSchema):
metadata_dict = datacite_metadata.to_dict()
if not schema43.validate(metadata_dict):
logger.error("Invalid DataCite metadata: %s", metadata_dict)
try:
schema45.validator.validate(metadata_dict)
except Exception as e:
            logger.error(
                "Invalid DataCite metadata: %s (%s)",
                schema45.tostring(metadata_dict),
                e,
            )
raise DataCiteError(f"Invalid DataCite metadata: {metadata_dict}")
return datacite_metadata, metadata_dict
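
A standalone sketch (not part of the diff) of the schema 4.5 validation pattern introduced above, reusing the same datacite library calls that appear in the changed lines (schema45.validator.validate raises on invalid input); the helper name and the choice to wrap failures in DataCiteError are illustrative:

from datacite import schema45
from datacite.errors import DataCiteError

def validate_datacite_45_metadata(metadata_dict: dict) -> dict:
    """Sketch: validate a DataCite 4.5 metadata dict, raising DataCiteError on failure."""
    try:
        # validator.validate() raises when metadata_dict does not satisfy schema 4.5
        schema45.validator.validate(metadata_dict)
    except Exception as e:
        raise DataCiteError(f"Invalid DataCite metadata: {e}") from e
    return metadata_dict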

@@ -202,17 +206,22 @@ def mint_public_doi(self, codebase_or_release: Codebase | CodebaseRelease):
return "XX.DRYXX/XXXX-XRUN", True
if hasattr(codebase_or_release, "datacite"):
del codebase_or_release.datacite
datacite_metadata, metadata_dict = self._validate_metadata(
codebase_or_release.datacite
)

doi = "Unassigned"
http_status = 200
message = "Minted new DOI successfully."

datacite_metadata = codebase_or_release.datacite

try:
datacite_metadata, metadata_dict = self._validate_metadata(
datacite_metadata
)
doi = self.datacite_client.public_doi(
metadata_dict, url=codebase_or_release.permanent_url
)
codebase_or_release.doi = doi
codebase_or_release.save()
except DataCiteError as e:
logger.error(e)
message = str(e)
@@ -233,9 +242,29 @@ def mint_public_doi(self, codebase_or_release: Codebase | CodebaseRelease):
release=codebase_or_release, action=DataCiteAction.CREATE_RELEASE_DOI
)
self._save_log_record(**log_record_dict)
return doi, http_status == 200
return doi, http_status

@classmethod
def is_metadata_fresh(cls, codebase_or_release: Codebase | CodebaseRelease):
try:
newest_log_entry = DataCiteRegistrationLog.objects.latest_entry(
codebase_or_release
)
# make sure item does not have stale datacite metadata
if hasattr(codebase_or_release, "datacite"):
del codebase_or_release.datacite
return newest_log_entry.metadata_hash == codebase_or_release.datacite.hash()

except DataCiteRegistrationLog.DoesNotExist:
# no logs for this item, metadata is stale
logger.info("No registration logs available for %s", codebase_or_release)

return False

def update_doi_metadata(self, codebase_or_release: Codebase | CodebaseRelease):
if self.is_metadata_fresh(codebase_or_release):
logger.info("No need to update DOI metadata for %s", codebase_or_release)
return True
doi = codebase_or_release.doi
if self.dry_run:
logger.debug("DRY RUN")
@@ -278,16 +307,10 @@ def update_doi_metadata(self, codebase_or_release: Codebase | CodebaseRelease):
self._save_log_record(**log_record_dict)
return http_status == 200

def mint_new_doi_for_codebase(self, codebase: Codebase) -> str:
return self.mint_public_doi(codebase)

def mint_new_doi_for_release(self, release: CodebaseRelease) -> str:
return self.mint_public_doi(release)

def update_metadata_for_codebase(self, codebase: Codebase) -> bool:
def update_codebase_metadata(self, codebase: Codebase) -> bool:
return self.update_doi_metadata(codebase)

def update_metadata_for_release(self, release: CodebaseRelease) -> bool:
def update_release_metadata(self, release: CodebaseRelease) -> bool:
return self.update_doi_metadata(release)

@staticmethod
@@ -332,21 +355,21 @@ def _is_deep_inclusive(elem1, elem2):
return True

@staticmethod
def _is_same_metadata(sent_data, received_data):
def is_metadata_equivalent(comses_metadata, datacite_metadata):
"""
Checks if the metadata attributes in the sent_data dictionary are the same as the
corresponding attributes in the received_data dictionary.
Args:
sent_data (dict): The dictionary containing the sent metadata attributes.
received_data (dict): The dictionary containing the received metadata attributes.
            comses_metadata (dict): A DataCite-compatible dictionary built from CoMSES metadata for a given Codebase or CodebaseRelease.
            datacite_metadata (dict): The metadata dictionary returned by DataCite for that item's registered DOI.
Returns:
bool: True if all attributes are the same, False otherwise.
"""
# Extract keys (attributes) from both dictionaries
sent_keys = set(sent_data.keys())
received_keys = set(received_data.keys())
sent_keys = set(comses_metadata.keys())
received_keys = set(datacite_metadata.keys())

# Initialize array to store different attributes
different_attributes = []
@@ -360,19 +383,19 @@ def _is_same_metadata(sent_data, received_data):
# FIXME: this accounts for publicationYear: None or "" sent to DataCite
EMPTY_VALS = [None, 0, "None", "0"]

if sent_data[key] and received_data[key]:
if str(sent_data[key]) != str(received_data[key]):
if comses_metadata[key] and datacite_metadata[key]:
if str(comses_metadata[key]) != str(datacite_metadata[key]):
different_attributes.append(key)
elif not (
sent_data[key] in EMPTY_VALS
and received_data[key] in EMPTY_VALS
comses_metadata[key] in EMPTY_VALS
and datacite_metadata[key] in EMPTY_VALS
):
different_attributes.append(key)
else:
continue

if not DataCiteApi._is_deep_inclusive(
sent_data[key], received_data[key]
comses_metadata[key], datacite_metadata[key]
):
# if sent_data[key] != received_data[key]:

@@ -386,8 +409,8 @@ def _is_same_metadata(sent_data, received_data):
logger.debug("Some attributes have different values:")
for attr in different_attributes:
logger.debug(
f"Attribute '{attr}':\nSent value - {sent_data[attr]}\n"
f"Received value - {received_data[attr]}\n\n"
f"Attribute '{attr}':\nSent value - {comses_metadata[attr]}\n"
f"Received value - {datacite_metadata[attr]}\n\n"
)
return False
else:
@@ -398,59 +421,55 @@ def _is_same_metadata(sent_data, received_data):
logger.debug("Missing attributes:", missing_attributes)
return False

def check_metadata(self, item) -> bool:
def get_datacite_metadata(self, doi: str):
"""
Get the metadata for the given DOI.
Args:
doi (str): The DOI for which to get the metadata.
Returns:
dict: The metadata for the given DOI.
"""
return self.datacite_client.get_metadata(doi)

def check_metadata(self, codebase_or_release: Codebase | CodebaseRelease) -> bool:
"""
        1. get the registered DataCite metadata for codebase_or_release.doi
        2. compare it against the locally generated codebase_or_release.datacite metadata
"""
if not item.doi:
if self.dry_run:
logger.debug(
"Dry run metadata check for %s", codebase_or_release.datacite.to_dict()
)
return True
if not codebase_or_release.doi:
logger.warning(
"Unnecessary metadata check for non-DOI codebase or release %s",
codebase_or_release,
)
return False
try:
if not self.dry_run:
comses_metadata = item.datacite.to_dict()
datacite_metadata = self.datacite_client.get_metadata(item.doi)
return DataCiteApi._is_same_metadata(comses_metadata, datacite_metadata)
else:
logger.debug(
f"{'Codebase' if isinstance(item, Codebase) else 'CodebaseRelease'} metadata is in sync!"
)
return True
comses_metadata = codebase_or_release.datacite.to_dict()
datacite_metadata = self.get_datacite_metadata(codebase_or_release.doi)
logger.debug(
"comparing datacite metadata\n\n%s\n\nwith comses metadata\n\n%s",
datacite_metadata,
comses_metadata,
)
return DataCiteApi.is_metadata_equivalent(
comses_metadata, datacite_metadata
)
except Exception as e:
logger.error(e)
return False

def threaded_metadata_check(self, items):
def loading_animation(thread):
while thread.is_alive():
print(".", end="", flush=True)
time.sleep(0.1)
print("\n")

def _check_metadata(q: queue.Queue):
with ThreadPoolExecutor(max_workers=MAX_DATACITE_API_WORKERS) as executor:
results = executor.map(
lambda item: (item.pk, self.check_metadata(item)), items
)

q.put(results)

# Create a queue to pass data between threads
result_queue = queue.Queue()

# Start the long-running function in a separate thread
thread = threading.Thread(target=_check_metadata, args=(result_queue,))
thread.start()

# Display the loading animation in the main thread
loading_animation(thread)

# Wait for the long-running function to finish
thread.join()
# Get the results from the queue
results = result_queue.get()
return results
def validate_metadata(self, items):
for item in items:
if item.doi:
yield (item, self.check_metadata(item))
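
A hedged usage sketch (not part of the diff) for the new validate_metadata generator, which replaces the old threaded check with lazy iteration; the DataCiteApi constructor call and the queryset are assumptions made for illustration:

api = DataCiteApi()  # constructor arguments omitted; assumed to default sensibly
releases_with_dois = CodebaseRelease.objects.exclude(doi__isnull=True)
for item, in_sync in api.validate_metadata(releases_with_dois):
    if not in_sync:
        logger.warning("DataCite metadata out of sync for %s (doi=%s)", item, item.doi)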

def _save_log_record(
self,
@@ -524,7 +543,7 @@ def mint_dois_for_peer_reviewed_releases_without_dois(interactive=True, dry_run=
"""
if not codebase_doi:
# request to DataCite API
codebase_doi = datacite_api.mint_new_doi_for_codebase(codebase)
codebase_doi, status_code = datacite_api.mint_public_doi(codebase)

if not codebase_doi:
logger.error(
@@ -544,8 +563,8 @@ def mint_dois_for_peer_reviewed_releases_without_dois(interactive=True, dry_run=
Mint DOI for release
"""
# request to DataCite API
release_doi = datacite_api.mint_new_doi_for_release(release)
if not release_doi:
release_doi, status_code = datacite_api.mint_public_doi(release)
if not release_doi or status_code != 200:
logger.error("Could not mint DOI for release %s. Skipping.", release.pk)
if interactive:
input("Press Enter to continue or CTRL+C to quit...")
@@ -559,7 +578,7 @@ def mint_dois_for_peer_reviewed_releases_without_dois(interactive=True, dry_run=
"""
    Since a new DOI has been minted for the release, we need to update its parent codebase's metadata (HasVersion)
"""
ok = datacite_api.update_metadata_for_codebase(codebase)
ok = datacite_api.update_codebase_metadata(codebase)
if not ok:
logger.error("Failed to update metadata for codebase %s", codebase.pk)

@@ -572,15 +591,15 @@ def mint_dois_for_peer_reviewed_releases_without_dois(interactive=True, dry_run=
next_release = release.get_next_release()

if previous_release and previous_release.doi:
ok = datacite_api.update_metadata_for_release(previous_release)
ok = datacite_api.update_release_metadata(previous_release)
if not ok:
logger.error(
"Failed to update metadata for previous_release %s",
previous_release.pk,
)

if next_release and next_release.doi:
ok = datacite_api.update_metadata_for_release(next_release)
ok = datacite_api.update_release_metadata(next_release)
if not ok:
logger.error(
"Failed to update metadata for next_release %s", next_release.pk
@@ -619,7 +638,7 @@ def mint_dois_for_peer_reviewed_releases_without_dois(interactive=True, dry_run=
if invalid_codebases:
logger.error(
"FAILURE: %s Codebases with invalid or missing DOIs: %s",
invalid_codebases.count(),
len(invalid_codebases),
invalid_codebases,
)
else: