Commit
incremental. #10734
landreev committed Sep 10, 2024
1 parent e62fc55 commit 3fab4c8
Showing 2 changed files with 238 additions and 51 deletions.
ImportServiceBean.java
@@ -33,6 +33,7 @@
import edu.harvard.iq.dataverse.engine.command.impl.CreateHarvestedDatasetCommand;
import edu.harvard.iq.dataverse.engine.command.impl.CreateNewDatasetCommand;
import edu.harvard.iq.dataverse.engine.command.impl.DestroyDatasetCommand;
import edu.harvard.iq.dataverse.engine.command.impl.UpdateHarvestedDatasetCommand;
import edu.harvard.iq.dataverse.harvest.client.HarvestingClient;
import edu.harvard.iq.dataverse.search.IndexServiceBean;
import edu.harvard.iq.dataverse.settings.SettingsServiceBean;
@@ -43,6 +44,7 @@
import edu.harvard.iq.dataverse.license.LicenseServiceBean;
import edu.harvard.iq.dataverse.pidproviders.PidUtil;
import static edu.harvard.iq.dataverse.search.IndexServiceBean.solrDocIdentifierFile;
import edu.harvard.iq.dataverse.util.DatasetFieldUtil;

import java.io.File;
import java.io.FileOutputStream;
@@ -366,25 +368,29 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
f.setSingleValue(DatasetField.NA_VALUE);
}
}


// @todo? - is this the right place to call tidyUpFields()?
// usually it is called within the body of the create/update commands.
DatasetFieldUtil.tidyUpFields(harvestedVersion.getDatasetFields(), true);

// Check data against validation constraints
// Similarly to how we handle missing required values (above), we
// replace invalid values with NA when harvesting.

boolean sanitized = validateDatasetVersion(harvestedVersion, true, cleanupLog);
boolean sanitized = validateAndSanitizeVersionMetadata(harvestedVersion, cleanupLog);

// Note: this sanitizing approach, of replacing invalid values with
// "NA" does not work with certain fields. For example, using it to
// populate a GeoBox coordinate value will result in an invalid
// field. So we will attempt to validate the santized version again,
// this time around, it will throw an exception if still invalid, so
// we'll stop before proceeding any further.
// field. So we will attempt to re-validate the sanitized version.
// This time around, it will throw an exception if still invalid, so
// that we'll stop before proceeding any further:

if (sanitized) {

validateVersionMetadata(harvestedVersion, cleanupLog);
}

Set<ConstraintViolation> invalidViolations = harvestedVersion.validate();
/*Set<ConstraintViolation> invalidViolations = harvestedVersion.validate();
if (!invalidViolations.isEmpty()) {
for (ConstraintViolation<DatasetFieldValue> v : invalidViolations) {
DatasetFieldValue f = v.getRootBean();
@@ -397,7 +403,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
// using it to populate a GeoBox coordinate value is going
// to result in an invalid field. @todo? - see below
}
}
}*/

// @todo? - re-validate the version before we do anything else?
// something along the lines of
@@ -407,6 +413,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
if (existingDataset != null) {
// @todo
// ... do the magic - parse the version json, do the switcheroo ...
/*
DatasetVersion existingVersion = existingDataset.getVersions().get(0);
Map<String, Integer> existingFilesIndex = new HashMap<>();
@@ -490,6 +497,9 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
// UpdateHarvestedDatasetCommand() ? (later on)
importedDataset = em.merge(existingDataset);
//@todo reindex
*/

importedDataset = engineSvc.submit(new UpdateHarvestedDatasetCommand(existingDataset, harvestedVersion, dataverseRequest));

} else {
importedDataset = engineSvc.submit(new CreateHarvestedDatasetCommand(harvestedDataset, dataverseRequest));
@@ -512,49 +522,6 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
}
return importedDataset;
}
/**
* Shortcut method for validating AND attempting to sanitize a DatasetVersion
* @param version
* @param cleanupLog - any invalid values and their replacements are logged there
* @return true if any invalid values were encountered and sanitized
* @throws ImportException (although it should never happen in this mode)
*/
private boolean validateAndSanitizeVersionMetadata(DatasetVersion version, PrintWriter cleanupLog) throws ImportException {
return validateVersionMetadata(version, true, cleanupLog);
}

private void validateVersionMetadata(DatasetVersion version, PrintWriter log) throws ImportException {
validateVersionMetadata(version, false, log);
}

private boolean validateVersionMetadata(DatasetVersion version, boolean sanitize, PrintWriter cleanupLog) throws ImportException {
boolean fixed = false;
Set<ConstraintViolation> invalidViolations = version.validate();
if (!invalidViolations.isEmpty()) {
for (ConstraintViolation<DatasetFieldValue> v : invalidViolations) {
DatasetFieldValue f = v.getRootBean();

String msg = "Invalid metadata field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; "
+ "Invalid value: '" + f.getValue() + "'";
if (sanitize) {
msg += ", replaced with '" + DatasetField.NA_VALUE + "'";
f.setValue(DatasetField.NA_VALUE);
fixed = true;
}
cleanupLog.println(msg);

// Note: "NA" does not work with certain fields. For example,
// using it to populate a GeoBox coordinate value is going
// to result in an invalid field. So we'll need to validate the
// version again after the first, sanitizing pass and see if it
// helped or not.
}
if (!sanitize) {
throw new ImportException("Version was still failing validation after the first attempt to sanitize the invalid values.");
}
}
return fixed;
}

public JsonObject ddiToJson(String xmlToParse) throws ImportException, XMLStreamException {
DatasetDTO dsDTO = importDDIService.doImport(ImportType.IMPORT, xmlToParse);
@@ -855,6 +822,67 @@ private String convertInvalidDateString(String inString){
return null;
}

/**
* A shortcut method for validating AND attempting to sanitize a DatasetVersion
* @param version
* @param cleanupLog - any invalid values and their replacements are logged there
* @return true if any invalid values were encountered and sanitized
* @throws ImportException (although it should never happen in this mode)
*/
private boolean validateAndSanitizeVersionMetadata(DatasetVersion version, PrintWriter cleanupLog) throws ImportException {
return validateVersionMetadata(version, true, cleanupLog);
}

/**
* A shortcut method for validating a DatasetVersion; will throw an exception
* if invalid, without attempting to sanitize the invalid values.
* @param version
* @param log - will log the invalid fields encountered there
* @throws ImportException
*/
private void validateVersionMetadata(DatasetVersion version, PrintWriter log) throws ImportException {
validateVersionMetadata(version, false, log);
}

/**
* Validates the metadata fields of a newly-created version and, depending on
* the "sanitize" flag supplied, may attempt to sanitize any invalid
* values by replacing them with "NA".
* @param version
* @param sanitize - boolean indicating whether to attempt to fix invalid values
* @param cleanupLog - any invalid values encountered will be logged there
* @return - true if any invalid values have been replaced
* @throws ImportException
*/
private boolean validateVersionMetadata(DatasetVersion version, boolean sanitize, PrintWriter cleanupLog) throws ImportException {
boolean fixed = false;
Set<ConstraintViolation> invalidViolations = version.validate();
if (!invalidViolations.isEmpty()) {
for (ConstraintViolation<DatasetFieldValue> v : invalidViolations) {
DatasetFieldValue f = v.getRootBean();

String msg = "Invalid metadata field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; "
+ "Invalid value: '" + f.getValue() + "'";
if (sanitize) {
msg += ", replaced with '" + DatasetField.NA_VALUE + "'";
f.setValue(DatasetField.NA_VALUE);
fixed = true;
}
cleanupLog.println(msg);

// Note: "NA" does not work with certain fields. For example,
// using it to populate a GeoBox coordinate value is going
// to result in an invalid field. So we'll need to validate the
// version again after the first, sanitizing pass and see if it
// helped or not.
}
if (!sanitize) {
throw new ImportException("Version was still failing validation after the first attempt to sanitize the invalid values.");
}
}
return fixed;
}


private static class MyCustomFormatter extends Formatter {

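For reference, the two-pass approach above (one sanitizing validation pass, then a strict re-validation that throws) can be reproduced outside of Dataverse with plain Jakarta Bean Validation, the same ConstraintViolation machinery behind DatasetVersion.validate(). This is a minimal standalone sketch, not code from the commit: CoordinateField and its pattern are hypothetical stand-ins for a GeoBox-style field where "NA" is itself an invalid value, and it assumes a Bean Validation implementation such as Hibernate Validator on the classpath.

import jakarta.validation.ConstraintViolation;
import jakarta.validation.Validation;
import jakarta.validation.Validator;
import jakarta.validation.constraints.Pattern;

public class TwoPassValidationSketch {

    static final String NA_VALUE = "NA";

    // Hypothetical stand-in for a field type where "NA" is itself invalid,
    // the way a GeoBox coordinate is in the real codebase.
    static class CoordinateField {
        @Pattern(regexp = "-?\\d+(\\.\\d+)?", message = "not a decimal coordinate")
        String value;

        CoordinateField(String value) { this.value = value; }
    }

    public static void main(String[] args) {
        Validator validator = Validation.buildDefaultValidatorFactory().getValidator();
        CoordinateField field = new CoordinateField("twelve degrees");

        // Pass 1: replace each invalid value with NA_VALUE and log the substitution.
        boolean sanitized = false;
        for (ConstraintViolation<CoordinateField> v : validator.validate(field)) {
            System.out.println("Invalid value: '" + field.value + "', replaced with '" + NA_VALUE + "'");
            field.value = NA_VALUE;
            sanitized = true;
        }

        // Pass 2: re-validate; "NA" still fails the coordinate pattern,
        // so we stop here instead of proceeding with a broken version.
        if (sanitized && !validator.validate(field).isEmpty()) {
            throw new IllegalStateException("still failing validation after sanitizing the invalid values");
        }
    }
}

The second pass exists precisely because of the GeoBox caveat in the comments above: substituting "NA" can fix a required-value violation, but it can just as easily introduce a new format violation, so the sanitized version cannot be trusted without re-checking.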
UpdateHarvestedDatasetCommand.java (new file)
@@ -0,0 +1,159 @@
package edu.harvard.iq.dataverse.engine.command.impl;

import edu.harvard.iq.dataverse.authorization.Permission;
import edu.harvard.iq.dataverse.engine.command.CommandContext;
import edu.harvard.iq.dataverse.engine.command.DataverseRequest;
import edu.harvard.iq.dataverse.engine.command.RequiredPermissions;
import edu.harvard.iq.dataverse.engine.command.exception.CommandException;
import edu.harvard.iq.dataverse.engine.command.exception.IllegalCommandException;
import edu.harvard.iq.dataverse.Dataset;
import edu.harvard.iq.dataverse.DatasetVersion;
import edu.harvard.iq.dataverse.DataFile;
import edu.harvard.iq.dataverse.FileMetadata;
import static edu.harvard.iq.dataverse.search.IndexServiceBean.solrDocIdentifierFile;
import java.io.IOException;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.solr.client.solrj.SolrServerException;

/**
*
* @author landreev
*
* Much simplified version of UpdateDatasetVersionCommand,
* but with some extra twists.
*/
@RequiredPermissions(Permission.EditDataset)
public class UpdateHarvestedDatasetCommand extends AbstractDatasetCommand<Dataset> {

private static final Logger logger = Logger.getLogger(UpdateHarvestedDatasetCommand.class.getCanonicalName());
private final DatasetVersion newHarvestedVersion;
final private boolean validateLenient = true;

public UpdateHarvestedDatasetCommand(Dataset theDataset, DatasetVersion newHarvestedVersion, DataverseRequest aRequest) {
super(aRequest, theDataset);
this.newHarvestedVersion = newHarvestedVersion;
}

public boolean isValidateLenient() {
return validateLenient;
}

@Override
public Dataset execute(CommandContext ctxt) throws CommandException {

// ... do the magic - parse the version json, do the switcheroo ...
Dataset existingDataset = getDataset();

if (existingDataset == null
|| existingDataset.getId() == null
|| !existingDataset.isHarvested()
|| existingDataset.getVersions().size() != 1) {
throw new IllegalCommandException("The command can only be called on an existing harvested dataset with only 1 version", this);
}
DatasetVersion existingVersion = existingDataset.getVersions().get(0);

if (newHarvestedVersion == null || newHarvestedVersion.getId() != null) {
throw new IllegalCommandException("The command can only be called with a newly-harvested, not yet saved DatasetVersion supplied", this);
}

Map<String, Integer> existingFilesIndex = new HashMap<>();

for (int i = 0; i < existingDataset.getFiles().size(); i++) {
String storageIdentifier = existingDataset.getFiles().get(i).getStorageIdentifier();
if (storageIdentifier != null) {
existingFilesIndex.put(storageIdentifier, i);
}
}

for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) {
// is it safe to assume that each new FileMetadata will be
// pointing to a non-null DataFile here?
String location = newFileMetadata.getDataFile().getStorageIdentifier();
if (location != null && existingFilesIndex.containsKey(location)) {
newFileMetadata.getDataFile().setFileMetadatas(new ArrayList<>());

int fileIndex = existingFilesIndex.get(location);
newFileMetadata.setDataFile(existingDataset.getFiles().get(fileIndex));
existingDataset.getFiles().get(fileIndex).getFileMetadatas().add(newFileMetadata);
existingFilesIndex.remove(location);
}
}
// @todo check that the newly-harvested DataFiles have the same checksums
// and mime types etc. These values are supposed to be immutable, normally,
// but who knows - they may have fixed something invalid on the other end
// @todo check if there's anything special that needs to be done for things
// like file categories

List<String> solrIdsOfDocumentsToDelete = new ArrayList<>();

// Go through the existing files and delete the ones that are
// no longer present in the version that we have just harvested:
for (FileMetadata oldFileMetadata : existingDataset.getVersions().get(0).getFileMetadatas()) {
DataFile oldDataFile = oldFileMetadata.getDataFile();
solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId());
existingDataset.getFiles().remove(oldDataFile);
// Files from harvested datasets are removed unceremoniously,
// directly in the database. No need to bother calling the
// DeleteFileCommand on them.
ctxt.em().remove(ctxt.em().merge(oldDataFile));
ctxt.em().remove(ctxt.em().merge(oldFileMetadata));
oldDataFile = null;
oldFileMetadata = null;
}

// purge all the SOLR documents associated with the files
// we have just deleted:
if (!solrIdsOfDocumentsToDelete.isEmpty()) {
ctxt.index().deleteHarvestedDocuments(solrIdsOfDocumentsToDelete);
}

// ... And now delete the existing version itself:
existingDataset.setVersions(new ArrayList<>());
ctxt.em().remove(ctxt.em().merge(existingVersion));

// Now attach the newly-harvested version to the dataset:
existingDataset.getVersions().add(newHarvestedVersion);
newHarvestedVersion.setDataset(existingDataset);

// ... There's one more thing to do - go through the new files,
// that are not in the database yet, and make sure they are
// attached to this existing dataset:
for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) {
if (newFileMetadata.getDataFile().getId() == null) {
existingDataset.getFiles().add(newFileMetadata.getDataFile());
newFileMetadata.getDataFile().setOwner(existingDataset);
}
}

ctxt.em().persist(newHarvestedVersion);

Dataset savedDataset = ctxt.em().merge(existingDataset);
ctxt.em().flush();

//@todo reindex

return savedDataset;
}

@Override
public boolean onSuccess(CommandContext ctxt, Object r) {
boolean retVal = true;
Dataset d = (Dataset) r;

try {
// Note that we index harvested datasets synchronously:
ctxt.index().indexDataset(d, true);
} catch (SolrServerException|IOException solrServerEx) {
logger.log(Level.WARNING, "Exception while trying to index the updated Harvested dataset " + d.getGlobalId().asString(), solrServerEx);
retVal = false;
}

return retVal;
}
}
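The core of the command is the file "switcheroo": each newly-harvested FileMetadata is re-pointed at the DataFile already in the database, matched on storage identifier. Below is a stripped-down sketch of just that matching step, with hypothetical stand-in classes (StoredFile, FileEntry) in place of the real JPA entities so that it runs on its own:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class HarvestedFileMatchSketch {

    // Minimal stand-ins for the DataFile and FileMetadata entities;
    // only the fields that the matching step actually touches.
    static class StoredFile {
        String storageIdentifier;
        List<FileEntry> fileMetadatas = new ArrayList<>();
        StoredFile(String storageIdentifier) { this.storageIdentifier = storageIdentifier; }
    }

    static class FileEntry {
        StoredFile dataFile;
        FileEntry(StoredFile dataFile) { this.dataFile = dataFile; }
    }

    public static void main(String[] args) {
        List<StoredFile> existingFiles = new ArrayList<>(List.of(
                new StoredFile("s3://bucket/abc123"),
                new StoredFile("s3://bucket/def456")));
        List<FileEntry> newMetadatas = new ArrayList<>(List.of(
                new FileEntry(new StoredFile("s3://bucket/abc123"))));

        // Index the existing files by storage identifier for constant-time lookups.
        Map<String, Integer> existingFilesIndex = new HashMap<>();
        for (int i = 0; i < existingFiles.size(); i++) {
            String sid = existingFiles.get(i).storageIdentifier;
            if (sid != null) {
                existingFilesIndex.put(sid, i);
            }
        }

        // Re-point each newly-harvested metadata entry at the file already
        // in the database, dropping the placeholder file it arrived with.
        for (FileEntry newEntry : newMetadatas) {
            String sid = newEntry.dataFile.storageIdentifier;
            if (sid != null && existingFilesIndex.containsKey(sid)) {
                StoredFile existing = existingFiles.get(existingFilesIndex.get(sid));
                newEntry.dataFile = existing;
                existing.fileMetadatas.add(newEntry);
                existingFilesIndex.remove(sid);
            }
        }

        // Here, abc123 was matched and def456 was not re-harvested:
        System.out.println("unmatched storage identifiers: " + existingFilesIndex.keySet());
    }
}

Building the HashMap index first keeps the matching linear in the number of files; anything still left in the index after the loop is a file that has disappeared from the newly-harvested version.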
