Skip to content

Commit

Permalink
draft ListRecords framework (only works with a locally patched xoai-5…
Browse files Browse the repository at this point in the history
….2.0) #10909
  • Loading branch information
landreev committed Oct 23, 2024
1 parent 6d336c8 commit b330d3e
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 38 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,16 @@
import org.apache.commons.lang3.mutable.MutableBoolean;
import org.xml.sax.SAXException;

import io.gdcc.xoai.model.oaipmh.results.Record;
import io.gdcc.xoai.model.oaipmh.results.record.Header;
import io.gdcc.xoai.model.oaipmh.results.record.Metadata;
import edu.harvard.iq.dataverse.EjbDataverseEngine;
import edu.harvard.iq.dataverse.api.imports.ImportServiceBean;
import edu.harvard.iq.dataverse.engine.command.DataverseRequest;
import edu.harvard.iq.dataverse.harvest.client.oai.OaiHandler;
import edu.harvard.iq.dataverse.harvest.client.oai.OaiHandlerException;
import edu.harvard.iq.dataverse.search.IndexServiceBean;
import io.gdcc.xoai.xml.XmlWriter;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.InputStream;
Expand All @@ -53,6 +56,7 @@
import java.nio.file.Path;
import jakarta.persistence.EntityManager;
import jakarta.persistence.PersistenceContext;
import javax.xml.stream.XMLStreamException;

/**
*
Expand Down Expand Up @@ -232,48 +236,104 @@ private void harvestOAI(DataverseRequest dataverseRequest, HarvestingClient harv
httpClient = HttpClient.newBuilder().followRedirects(HttpClient.Redirect.ALWAYS).build();

try {
for (Iterator<Header> idIter = oaiHandler.runListIdentifiers(); idIter.hasNext();) {
// Before each iteration, check if this harvesting job needs to be aborted:
if (checkIfStoppingJob(harvestingClient)) {
throw new StopHarvestException("Harvesting stopped by external request");
}
if (harvestingClient.isUseListRecords()) {
harvestOAIviaListRecords(oaiHandler, dataverseRequest, harvestingClient, httpClient, failedIdentifiers, deletedIdentifiers, harvestedDatasetIds, hdLogger, importCleanupLog);
} else {
// The default behavior is to use ListIdentifiers:
harvestOAIviaListIdentifiers(oaiHandler, dataverseRequest, harvestingClient, httpClient, failedIdentifiers, deletedIdentifiers, harvestedDatasetIds, hdLogger, importCleanupLog);
}
} catch (OaiHandlerException e) {
throw new IOException("Failed to run ListIdentifiers: " + e.getMessage());
}

Header h = idIter.next();
String identifier = h.getIdentifier();
Date dateStamp = Date.from(h.getDatestamp());

hdLogger.info("processing identifier: " + identifier + ", date: " + dateStamp);

if (h.isDeleted()) {
hdLogger.info("Deleting harvesting dataset for " + identifier + ", per ListIdentifiers.");
logCompletedOaiHarvest(hdLogger, harvestingClient);

deleteHarvestedDatasetIfExists(identifier, oaiHandler.getHarvestingClient().getDataverse(), dataverseRequest, deletedIdentifiers, hdLogger);
continue;
}
}

private void harvestOAIviaListIdentifiers(OaiHandler oaiHandler, DataverseRequest dataverseRequest, HarvestingClient harvestingClient, HttpClient httpClient, List<String> failedIdentifiers, List<String> deletedIdentifiers, List<Long> harvestedDatasetIds, Logger harvesterLogger, PrintWriter importCleanupLog) throws OaiHandlerException, StopHarvestException {
for (Iterator<Header> idIter = oaiHandler.runListIdentifiers(); idIter.hasNext();) {
// Before each iteration, check if this harvesting job needs to be aborted:
if (checkIfStoppingJob(harvestingClient)) {
throw new StopHarvestException("Harvesting stopped by external request");
}

MutableBoolean getRecordErrorOccurred = new MutableBoolean(false);
Header h = idIter.next();
String identifier = h.getIdentifier();
Date dateStamp = Date.from(h.getDatestamp());

// Retrieve and process this record with a separate GetRecord call:

Long datasetId = processRecord(dataverseRequest, hdLogger, importCleanupLog, oaiHandler, identifier, getRecordErrorOccurred, deletedIdentifiers, dateStamp, httpClient);

if (datasetId != null) {
harvestedDatasetIds.add(datasetId);
}

if (getRecordErrorOccurred.booleanValue() == true) {
failedIdentifiers.add(identifier);
//can be uncommented out for testing failure handling:
//throw new IOException("Exception occured, stopping harvest");
}
harvesterLogger.info("ListIdentifiers; processing identifier: " + identifier + ", date: " + dateStamp);

if (h.isDeleted()) {
harvesterLogger.info("ListIdentifiers; deleting harvesting dataset for " + identifier);

deleteHarvestedDatasetIfExists(identifier, oaiHandler.getHarvestingClient().getDataverse(), dataverseRequest, deletedIdentifiers, harvesterLogger);
continue;
}

MutableBoolean getRecordErrorOccurred = new MutableBoolean(false);

// Retrieve and process this record with a separate GetRecord call:
Long datasetId = processRecord(dataverseRequest, harvesterLogger, importCleanupLog, oaiHandler, identifier, getRecordErrorOccurred, deletedIdentifiers, dateStamp, httpClient);

if (datasetId != null) {
harvestedDatasetIds.add(datasetId);
}

if (getRecordErrorOccurred.booleanValue() == true) {
failedIdentifiers.add(identifier);
//can be uncommented out for testing failure handling:
//throw new IOException("Exception occured, stopping harvest");
}
} catch (OaiHandlerException e) {
throw new IOException("Failed to run ListIdentifiers: " + e.getMessage());
}
}

private void harvestOAIviaListRecords(OaiHandler oaiHandler, DataverseRequest dataverseRequest, HarvestingClient harvestingClient, HttpClient httpClient, List<String> failedIdentifiers, List<String> deletedIdentifiers, List<Long> harvestedDatasetIds, Logger harvesterLogger, PrintWriter importCleanupLog) throws OaiHandlerException, StopHarvestException {
for (Iterator<Record> idIter = oaiHandler.runListRecords(); idIter.hasNext();) {
// Before each iteration, check if this harvesting job needs to be aborted:
if (checkIfStoppingJob(harvestingClient)) {
throw new StopHarvestException("Harvesting stopped by external request");
}

logCompletedOaiHarvest(hdLogger, harvestingClient);
Record oaiRecord = idIter.next();

try {
harvesterLogger.info("record.getMetadata() (via XmlWriter):" + XmlWriter.toString(oaiRecord.getMetadata()));
} catch (XMLStreamException xsx) {
harvesterLogger.info("Caught an XMLStreamException: " + xsx.getMessage());
}


Header h = oaiRecord.getHeader();
String identifier = h.getIdentifier();
Date dateStamp = Date.from(h.getDatestamp());

}
harvesterLogger.info("ListRecords; processing identifier : " + identifier + ", date: " + dateStamp);

if (h.isDeleted()) {
harvesterLogger.info("ListRecords; Deleting harvested dataset for " + identifier);

deleteHarvestedDatasetIfExists(identifier, oaiHandler.getHarvestingClient().getDataverse(), dataverseRequest, deletedIdentifiers, harvesterLogger);
continue;
}

MutableBoolean getRecordErrorOccurred = new MutableBoolean(false);

Metadata oaiMetadata = oaiRecord.getMetadata();

// Retrieve and process this record with a separate GetRecord call:
Long datasetId = processRecord(dataverseRequest, harvesterLogger, importCleanupLog, oaiHandler, identifier, getRecordErrorOccurred, deletedIdentifiers, dateStamp, httpClient);

if (datasetId != null) {
harvestedDatasetIds.add(datasetId);
}

if (getRecordErrorOccurred.booleanValue() == true) {
failedIdentifiers.add(identifier);
//can be uncommented out for testing failure handling:
//throw new IOException("Exception occured, stopping harvest");
}
}
}

private Long processRecord(DataverseRequest dataverseRequest, Logger hdLogger, PrintWriter importCleanupLog, OaiHandler oaiHandler, String identifier, MutableBoolean recordErrorOccurred, List<String> deletedIdentifiers, Date dateStamp, HttpClient httpClient) {
String errMessage = null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -253,19 +253,19 @@ public void setAllowHarvestingMissingCVV(boolean allowHarvestingMissingCVV) {
this.allowHarvestingMissingCVV = allowHarvestingMissingCVV;
}

private Boolean useListRecords;
private boolean useListRecords;

public Boolean isUseListRecords() {
public boolean isUseListRecords() {
return useListRecords;
}

/**
 * Sets the flag returned by {@link #isUseListRecords()}.
 * <p>
 * NOTE(review): the method name is spelled "Listrecords" (lowercase "r"),
 * unlike the getter; name-based property tooling may not pair the two.
 * Confirm against callers before renaming.
 *
 * @param useListRecords the new value of the flag
 */
public void setUseListrecords(boolean useListRecords) {
this.useListRecords = useListRecords;
}

private Boolean useOaiIdAsPid;
private boolean useOaiIdAsPid;

public Boolean isUseOaiIdentifiersAsPids() {
public boolean isUseOaiIdentifiersAsPids() {
return useOaiIdAsPid;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package edu.harvard.iq.dataverse.harvest.client.oai;

import io.gdcc.xoai.model.oaipmh.Granularity;
import io.gdcc.xoai.model.oaipmh.results.Record;
import io.gdcc.xoai.model.oaipmh.results.record.Header;
import io.gdcc.xoai.model.oaipmh.results.MetadataFormat;
import io.gdcc.xoai.model.oaipmh.results.Set;
Expand All @@ -11,6 +12,7 @@
import io.gdcc.xoai.serviceprovider.exceptions.IdDoesNotExistException;
import io.gdcc.xoai.serviceprovider.model.Context;
import io.gdcc.xoai.serviceprovider.parameters.ListIdentifiersParameters;
import io.gdcc.xoai.serviceprovider.parameters.ListRecordsParameters;
import edu.harvard.iq.dataverse.harvest.client.FastGetRecord;
import static edu.harvard.iq.dataverse.harvest.client.HarvesterServiceBean.DATAVERSE_PROPRIETARY_METADATA_API;
import edu.harvard.iq.dataverse.harvest.client.HarvestingClient;
Expand Down Expand Up @@ -258,6 +260,16 @@ public Iterator<Header> runListIdentifiers() throws OaiHandlerException {

}

/**
 * Runs a ListRecords request with the parameters produced by
 * {@code buildListRecordsParams()} and returns the resulting record iterator.
 *
 * @return iterator over the records supplied by the service provider
 * @throws OaiHandlerException if the parameters cannot be built, or if the
 *         service provider rejects them with a BadArgumentException
 */
public Iterator<Record> runListRecords() throws OaiHandlerException {
    ListRecordsParameters parameters = buildListRecordsParams();
    try {
        return getServiceProvider().listRecords(parameters);
    } catch (BadArgumentException bae) {
        // Keep the underlying detail; without it the harvest log cannot tell
        // which argument was rejected.
        throw new OaiHandlerException("BadArgumentException thrown when attempted to run ListRecords: " + bae.getMessage());
    }
}

public FastGetRecord runGetRecord(String identifier, HttpClient httpClient) throws OaiHandlerException {
if (StringUtils.isEmpty(this.baseOaiUrl)) {
throw new OaiHandlerException("Attempted to execute GetRecord without server URL specified.");
Expand Down Expand Up @@ -299,6 +311,25 @@ private ListIdentifiersParameters buildListIdentifiersParams() throws OaiHandler
return mip;
}

/**
 * Assembles the parameters of a ListRecords request from this handler's
 * settings: the metadata prefix (mandatory), plus the "from" date and the
 * set name when they are configured.
 *
 * @return the populated request parameters
 * @throws OaiHandlerException if no metadata prefix has been specified
 */
private ListRecordsParameters buildListRecordsParams() throws OaiHandlerException {
    // The metadata prefix is the only mandatory setting; reject early without it.
    if (StringUtils.isEmpty(this.metadataPrefix)) {
        throw new OaiHandlerException("Attempted to create a ListRecords request without metadataPrefix specified");
    }

    ListRecordsParameters listRecordsParams = ListRecordsParameters.request();
    listRecordsParams.withMetadataPrefix(this.metadataPrefix);

    // Optional restriction by date:
    if (this.fromDate != null) {
        listRecordsParams.withFrom(this.fromDate.toInstant());
    }

    // Optional restriction by set:
    boolean setNameSpecified = !StringUtils.isEmpty(this.setName);
    if (setNameSpecified) {
        listRecordsParams.withSetSpec(this.setName);
    }

    return listRecordsParams;
}

public String getProprietaryDataverseMetadataURL(String identifier) {

if (dataverseApiUrl == null) {
Expand Down

0 comments on commit b330d3e

Please sign in to comment.