Quick draft implementation addressing issue 1 from #10909.

IQSS · Oct 21, 2024 · 6d336c8 · 6d336c8
1 parent d039a10
commit 6d336c8
Show file tree

Hide file tree

Showing 4 changed files with 67 additions and 13 deletions.
diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportGenericServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportGenericServiceBean.java
@@ -150,12 +150,16 @@ public DatasetDTO processXML( XMLStreamReader xmlr, ForeignMetadataFormatMapping
 
     }
 
-    // Helper method for importing harvested Dublin Core xml.
+    // Helper methods for importing harvested Dublin Core xml.
     // Dublin Core is considered a mandatory, built in metadata format mapping. 
     // It is distributed as required content, in reference_data.sql. 
     // Note that arbitrary formatting tags are supported for the outer xml
     // wrapper. -- L.A. 4.5
     public DatasetDTO processOAIDCxml(String DcXmlToParse) throws XMLStreamException {
+        // Convenience overload: no identifier is supplied explicitly, so the
+        // two-argument version falls back to the identifier found in the
+        // parsed record itself.
+        final String noSuppliedIdentifier = null;
+        return processOAIDCxml(DcXmlToParse, noSuppliedIdentifier);
+    }
+
+    public DatasetDTO processOAIDCxml(String DcXmlToParse, String oaiIdentifier) throws XMLStreamException {
         // look up DC metadata mapping: 
 
         ForeignMetadataFormatMapping dublinCoreMapping = findFormatMappingByName(DCTERMS);
@@ -185,18 +189,37 @@ public DatasetDTO processOAIDCxml(String DcXmlToParse) throws XMLStreamException
 
         datasetDTO.getDatasetVersion().setVersionState(DatasetVersion.VersionState.RELEASED);
 
-        // Our DC import handles the contents of the dc:identifier field 
-        // as an "other id". In the context of OAI harvesting, we expect 
-        // the identifier to be a global id, so we need to rearrange that: 
+        // In some cases, the identifier that we want to use for the dataset is 
+        // already supplied to the method explicitly. For example, in some 
+        // harvesting cases we'll want to use the OAI identifier (the identifier 
+        // from the <header> section of the OAI record) for that purpose, without
+        // expecting to find a valid persistent id in the body of the DC record:
 
-        String identifier = getOtherIdFromDTO(datasetDTO.getDatasetVersion());
-        logger.fine("Imported identifier: "+identifier);
+        String globalIdentifier; 
 
-        String globalIdentifier = reassignIdentifierAsGlobalId(identifier, datasetDTO);
-        logger.fine("Detected global identifier: "+globalIdentifier);
+        if (oaiIdentifier != null) {
+            logger.fine("Attempting to use " + oaiIdentifier + " as the persistentId of the imported dataset");
+
+            globalIdentifier = reassignIdentifierAsGlobalId(oaiIdentifier, datasetDTO);
+        } else {
+            // Our DC import handles the contents of the dc:identifier field 
+            // as an "other id". Unless we are using an externally supplied 
+            // global id, we will be using the first such "other id" that we 
+            // can parse and recognize as the global id for the imported dataset
+            // (note that this is the default behavior during harvesting),
+            // so we need to reassign it accordingly: 
+            String identifier = getOtherIdFromDTO(datasetDTO.getDatasetVersion());
+            logger.fine("Imported identifier: " + identifier);
+
+            globalIdentifier = reassignIdentifierAsGlobalId(identifier, datasetDTO);
+            logger.fine("Detected global identifier: " + globalIdentifier);
+        }
 
         if (globalIdentifier == null) {
-            throw new EJBException("Failed to find a global identifier in the OAI_DC XML record.");
+            String exceptionMsg = oaiIdentifier == null ? 
+                    "Failed to find a global identifier in the OAI_DC XML record." : 
+                    "Failed to parse the supplied identifier as a valid Persistent Id";
+            throw new EJBException(exceptionMsg);
         }
 
         return datasetDTO;

diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java
@@ -206,7 +206,13 @@ public JsonObjectBuilder handleFile(DataverseRequest dataverseRequest, Dataverse
     }
 
     @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
-    public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, HarvestingClient harvestingClient, String harvestIdentifier, String metadataFormat, File metadataFile, Date oaiDateStamp, PrintWriter cleanupLog) throws ImportException, IOException {
+    public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, 
+            HarvestingClient harvestingClient, 
+            String harvestIdentifier, 
+            String metadataFormat, 
+            File metadataFile, 
+            Date oaiDateStamp, 
+            PrintWriter cleanupLog) throws ImportException, IOException {
         if (harvestingClient == null || harvestingClient.getDataverse() == null) {
             throw new ImportException("importHarvestedDataset called wiht a null harvestingClient, or an invalid harvestingClient.");
         }
@@ -243,7 +249,10 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
             logger.fine("importing DC "+metadataFile.getAbsolutePath());
             try {
                 String xmlToParse = new String(Files.readAllBytes(metadataFile.toPath()));
-                dsDTO = importGenericService.processOAIDCxml(xmlToParse);
+                String suggestedIdentifier = harvestingClient.isUseOaiIdentifiersAsPids() 
+                        ? harvestIdentifier 
+                        : null; 
+                dsDTO = importGenericService.processOAIDCxml(xmlToParse, suggestedIdentifier);
             } catch (IOException | XMLStreamException e) {
                 throw new ImportException("Failed to process Dublin Core XML record: "+ e.getClass() + " (" + e.getMessage() + ")");
             }

diff --git a/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvestingClient.java b/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvestingClient.java
@@ -214,6 +214,7 @@ public void setArchiveDescription(String archiveDescription) {
         this.archiveDescription = archiveDescription; 
     }
 
+    @Column(columnDefinition="TEXT")
     private String harvestingSet;
 
     public String getHarvestingSet() {
@@ -252,8 +253,26 @@ public void setAllowHarvestingMissingCVV(boolean allowHarvestingMissingCVV) {
         this.allowHarvestingMissingCVV = allowHarvestingMissingCVV;
     }
 
-    // TODO: do we need "orphanRemoval=true"? -- L.A. 4.4
-    // TODO: should it be @OrderBy("startTime")? -- L.A. 4.4
+    // Harvesting client flag (presumably selects the OAI ListRecords verb for
+    // harvesting -- TODO confirm against the harvester code).
+    // Kept as a nullable Boolean with no initializer here, so the value can be
+    // null until a setter is called; the getter below maps null to false.
+    private Boolean useListRecords;
+
+    /**
+     * @return true only if the flag has been explicitly enabled. Never returns
+     *         null, so callers can safely unbox the result.
+     */
+    public Boolean isUseListRecords() {
+        return Boolean.TRUE.equals(useListRecords);
+    }
+
+    /**
+     * @param useListRecords the new value of the flag
+     */
+    public void setUseListRecords(boolean useListRecords) {
+        this.useListRecords = useListRecords;
+    }
+
+    /**
+     * Original, mis-capitalized setter name; kept so existing callers still compile.
+     *
+     * @param useListRecords the new value of the flag
+     * @deprecated use {@link #setUseListRecords(boolean)}
+     */
+    @Deprecated
+    public void setUseListrecords(boolean useListRecords) {
+        setUseListRecords(useListRecords);
+    }
+
+    // When enabled, the Dublin Core import is handed the OAI record identifier
+    // to use as the persistent id of the harvested dataset, instead of looking
+    // for an identifier in the body of the record.
+    // Kept as a nullable Boolean with no initializer here, so the value can be
+    // null until the setter is called; the getter below maps null to false.
+    private Boolean useOaiIdAsPid;
+
+    /**
+     * @return true only if the flag has been explicitly enabled. Never returns
+     *         null, so the result can be used directly in a condition without
+     *         risking a NullPointerException on unboxing.
+     */
+    public Boolean isUseOaiIdentifiersAsPids() {
+        return Boolean.TRUE.equals(useOaiIdAsPid);
+    }
+
+    /**
+     * @param useOaiIdAsPid the new value of the flag
+     */
+    public void setUseOaiIdentifiersAsPids(boolean useOaiIdAsPid) {
+        this.useOaiIdAsPid = useOaiIdAsPid;
+    }
+
     @OneToMany(mappedBy="harvestingClient", cascade={CascadeType.REMOVE, CascadeType.MERGE, CascadeType.PERSIST})
     @OrderBy("id")
     private List<ClientHarvestRun> harvestHistory;

diff --git a/src/main/resources/db/migration/V6.4.0.1.sql b/src/main/resources/db/migration/V6.4.0.1.sql
@@ -0,0 +1,3 @@
+-- New harvesting client options: two boolean flags, both defaulting to FALSE.
+ALTER TABLE harvestingclient
+    ADD COLUMN IF NOT EXISTS useOaiIdAsPid BOOLEAN DEFAULT FALSE,
+    ADD COLUMN IF NOT EXISTS useListRecords BOOLEAN DEFAULT FALSE;