diff --git a/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvesterServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvesterServiceBean.java index e0b5c2dfbfb..8b36e3e311d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvesterServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvesterServiceBean.java @@ -33,13 +33,16 @@ import org.apache.commons.lang3.mutable.MutableBoolean; import org.xml.sax.SAXException; +import io.gdcc.xoai.model.oaipmh.results.Record; import io.gdcc.xoai.model.oaipmh.results.record.Header; +import io.gdcc.xoai.model.oaipmh.results.record.Metadata; import edu.harvard.iq.dataverse.EjbDataverseEngine; import edu.harvard.iq.dataverse.api.imports.ImportServiceBean; import edu.harvard.iq.dataverse.engine.command.DataverseRequest; import edu.harvard.iq.dataverse.harvest.client.oai.OaiHandler; import edu.harvard.iq.dataverse.harvest.client.oai.OaiHandlerException; import edu.harvard.iq.dataverse.search.IndexServiceBean; +import io.gdcc.xoai.xml.XmlWriter; import java.io.FileOutputStream; import java.io.FileWriter; import java.io.InputStream; @@ -53,6 +56,7 @@ import java.nio.file.Path; import jakarta.persistence.EntityManager; import jakarta.persistence.PersistenceContext; +import javax.xml.stream.XMLStreamException; /** * @@ -232,48 +236,104 @@ private void harvestOAI(DataverseRequest dataverseRequest, HarvestingClient harv httpClient = HttpClient.newBuilder().followRedirects(HttpClient.Redirect.ALWAYS).build(); try { - for (Iterator
idIter = oaiHandler.runListIdentifiers(); idIter.hasNext();) { - // Before each iteration, check if this harvesting job needs to be aborted: - if (checkIfStoppingJob(harvestingClient)) { - throw new StopHarvestException("Harvesting stopped by external request"); - } + if (harvestingClient.isUseListRecords()) { + harvestOAIviaListRecords(oaiHandler, dataverseRequest, harvestingClient, httpClient, failedIdentifiers, deletedIdentifiers, harvestedDatasetIds, hdLogger, importCleanupLog); + } else { + // The default behavior is to use ListIdentifiers: + harvestOAIviaListIdentifiers(oaiHandler, dataverseRequest, harvestingClient, httpClient, failedIdentifiers, deletedIdentifiers, harvestedDatasetIds, hdLogger, importCleanupLog); + } + } catch (OaiHandlerException e) { + throw new IOException("Failed to run ListIdentifiers: " + e.getMessage()); + } - Header h = idIter.next(); - String identifier = h.getIdentifier(); - Date dateStamp = Date.from(h.getDatestamp()); - - hdLogger.info("processing identifier: " + identifier + ", date: " + dateStamp); - - if (h.isDeleted()) { - hdLogger.info("Deleting harvesting dataset for " + identifier + ", per ListIdentifiers."); + logCompletedOaiHarvest(hdLogger, harvestingClient); - deleteHarvestedDatasetIfExists(identifier, oaiHandler.getHarvestingClient().getDataverse(), dataverseRequest, deletedIdentifiers, hdLogger); - continue; - } + } + + private void harvestOAIviaListIdentifiers(OaiHandler oaiHandler, DataverseRequest dataverseRequest, HarvestingClient harvestingClient, HttpClient httpClient, List failedIdentifiers, List deletedIdentifiers, List harvestedDatasetIds, Logger harvesterLogger, PrintWriter importCleanupLog) throws OaiHandlerException, StopHarvestException { + for (Iterator
idIter = oaiHandler.runListIdentifiers(); idIter.hasNext();) { + // Before each iteration, check if this harvesting job needs to be aborted: + if (checkIfStoppingJob(harvestingClient)) { + throw new StopHarvestException("Harvesting stopped by external request"); + } - MutableBoolean getRecordErrorOccurred = new MutableBoolean(false); + Header h = idIter.next(); + String identifier = h.getIdentifier(); + Date dateStamp = Date.from(h.getDatestamp()); - // Retrieve and process this record with a separate GetRecord call: - - Long datasetId = processRecord(dataverseRequest, hdLogger, importCleanupLog, oaiHandler, identifier, getRecordErrorOccurred, deletedIdentifiers, dateStamp, httpClient); - - if (datasetId != null) { - harvestedDatasetIds.add(datasetId); - } - - if (getRecordErrorOccurred.booleanValue() == true) { - failedIdentifiers.add(identifier); - //can be uncommented out for testing failure handling: - //throw new IOException("Exception occured, stopping harvest"); - } + harvesterLogger.info("ListIdentifiers; processing identifier: " + identifier + ", date: " + dateStamp); + + if (h.isDeleted()) { + harvesterLogger.info("ListIdentifiers; deleting harvesting dataset for " + identifier); + + deleteHarvestedDatasetIfExists(identifier, oaiHandler.getHarvestingClient().getDataverse(), dataverseRequest, deletedIdentifiers, harvesterLogger); + continue; + } + + MutableBoolean getRecordErrorOccurred = new MutableBoolean(false); + + // Retrieve and process this record with a separate GetRecord call: + Long datasetId = processRecord(dataverseRequest, harvesterLogger, importCleanupLog, oaiHandler, identifier, getRecordErrorOccurred, deletedIdentifiers, dateStamp, httpClient); + + if (datasetId != null) { + harvestedDatasetIds.add(datasetId); + } + + if (getRecordErrorOccurred.booleanValue() == true) { + failedIdentifiers.add(identifier); + //can be uncommented out for testing failure handling: + //throw new IOException("Exception occured, stopping harvest"); } - } catch (OaiHandlerException e) { - throw new IOException("Failed to run ListIdentifiers: " + e.getMessage()); } + } + + private void harvestOAIviaListRecords(OaiHandler oaiHandler, DataverseRequest dataverseRequest, HarvestingClient harvestingClient, HttpClient httpClient, List failedIdentifiers, List deletedIdentifiers, List harvestedDatasetIds, Logger harvesterLogger, PrintWriter importCleanupLog) throws OaiHandlerException, StopHarvestException { + for (Iterator idIter = oaiHandler.runListRecords(); idIter.hasNext();) { + // Before each iteration, check if this harvesting job needs to be aborted: + if (checkIfStoppingJob(harvestingClient)) { + throw new StopHarvestException("Harvesting stopped by external request"); + } - logCompletedOaiHarvest(hdLogger, harvestingClient); + Record oaiRecord = idIter.next(); + + try { + harvesterLogger.info("record.getMetadata() (via XmlWriter):" + XmlWriter.toString(oaiRecord.getMetadata())); + } catch (XMLStreamException xsx) { + harvesterLogger.info("Caught an XMLStreamException: " + xsx.getMessage()); + } + + + Header h = oaiRecord.getHeader(); + String identifier = h.getIdentifier(); + Date dateStamp = Date.from(h.getDatestamp()); - } + harvesterLogger.info("ListRecords; processing identifier : " + identifier + ", date: " + dateStamp); + + if (h.isDeleted()) { + harvesterLogger.info("ListRecords; Deleting harvested dataset for " + identifier); + + deleteHarvestedDatasetIfExists(identifier, oaiHandler.getHarvestingClient().getDataverse(), dataverseRequest, deletedIdentifiers, harvesterLogger); + continue; + } + + MutableBoolean getRecordErrorOccurred = new MutableBoolean(false); + + Metadata oaiMetadata = oaiRecord.getMetadata(); + + // Retrieve and process this record with a separate GetRecord call: + Long datasetId = processRecord(dataverseRequest, harvesterLogger, importCleanupLog, oaiHandler, identifier, getRecordErrorOccurred, deletedIdentifiers, dateStamp, httpClient); + + if (datasetId != null) { + harvestedDatasetIds.add(datasetId); + } + + if (getRecordErrorOccurred.booleanValue() == true) { + failedIdentifiers.add(identifier); + //can be uncommented out for testing failure handling: + //throw new IOException("Exception occured, stopping harvest"); + } + } + } private Long processRecord(DataverseRequest dataverseRequest, Logger hdLogger, PrintWriter importCleanupLog, OaiHandler oaiHandler, String identifier, MutableBoolean recordErrorOccurred, List deletedIdentifiers, Date dateStamp, HttpClient httpClient) { String errMessage = null; diff --git a/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvestingClient.java b/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvestingClient.java index ec26729b685..c7ab91b78be 100644 --- a/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvestingClient.java +++ b/src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvestingClient.java @@ -253,9 +253,9 @@ public void setAllowHarvestingMissingCVV(boolean allowHarvestingMissingCVV) { this.allowHarvestingMissingCVV = allowHarvestingMissingCVV; } - private Boolean useListRecords; + private boolean useListRecords; - public Boolean isUseListRecords() { + public boolean isUseListRecords() { return useListRecords; } @@ -263,9 +263,9 @@ public void setUseListrecords(boolean useListRecords) { this.useListRecords = useListRecords; } - private Boolean useOaiIdAsPid; + private boolean useOaiIdAsPid; - public Boolean isUseOaiIdentifiersAsPids() { + public boolean isUseOaiIdentifiersAsPids() { return useOaiIdAsPid; } diff --git a/src/main/java/edu/harvard/iq/dataverse/harvest/client/oai/OaiHandler.java b/src/main/java/edu/harvard/iq/dataverse/harvest/client/oai/OaiHandler.java index bb3dc06972c..07c193d3ae2 100644 --- a/src/main/java/edu/harvard/iq/dataverse/harvest/client/oai/OaiHandler.java +++ b/src/main/java/edu/harvard/iq/dataverse/harvest/client/oai/OaiHandler.java @@ -1,6 +1,7 @@ package edu.harvard.iq.dataverse.harvest.client.oai; import io.gdcc.xoai.model.oaipmh.Granularity; +import io.gdcc.xoai.model.oaipmh.results.Record; import io.gdcc.xoai.model.oaipmh.results.record.Header; import io.gdcc.xoai.model.oaipmh.results.MetadataFormat; import io.gdcc.xoai.model.oaipmh.results.Set; @@ -11,6 +12,7 @@ import io.gdcc.xoai.serviceprovider.exceptions.IdDoesNotExistException; import io.gdcc.xoai.serviceprovider.model.Context; import io.gdcc.xoai.serviceprovider.parameters.ListIdentifiersParameters; +import io.gdcc.xoai.serviceprovider.parameters.ListRecordsParameters; import edu.harvard.iq.dataverse.harvest.client.FastGetRecord; import static edu.harvard.iq.dataverse.harvest.client.HarvesterServiceBean.DATAVERSE_PROPRIETARY_METADATA_API; import edu.harvard.iq.dataverse.harvest.client.HarvestingClient; @@ -258,6 +260,16 @@ public Iterator
runListIdentifiers() throws OaiHandlerException { } + public Iterator runListRecords() throws OaiHandlerException { + ListRecordsParameters parameters = buildListRecordsParams(); + try { + return getServiceProvider().listRecords(parameters); + } catch (BadArgumentException bae) { + throw new OaiHandlerException("BadArgumentException thrown when attempted to run ListRecords"); + } + + } + public FastGetRecord runGetRecord(String identifier, HttpClient httpClient) throws OaiHandlerException { if (StringUtils.isEmpty(this.baseOaiUrl)) { throw new OaiHandlerException("Attempted to execute GetRecord without server URL specified."); @@ -299,6 +311,25 @@ private ListIdentifiersParameters buildListIdentifiersParams() throws OaiHandler return mip; } + private ListRecordsParameters buildListRecordsParams() throws OaiHandlerException { + ListRecordsParameters mip = ListRecordsParameters.request(); + + if (StringUtils.isEmpty(this.metadataPrefix)) { + throw new OaiHandlerException("Attempted to create a ListRecords request without metadataPrefix specified"); + } + mip.withMetadataPrefix(metadataPrefix); + + if (this.fromDate != null) { + mip.withFrom(this.fromDate.toInstant()); + } + + if (!StringUtils.isEmpty(this.setName)) { + mip.withSetSpec(this.setName); + } + + return mip; + } + public String getProprietaryDataverseMetadataURL(String identifier) { if (dataverseApiUrl == null) {