Commit
incremental. #10734
landreev committed Sep 10, 2024
1 parent e62fc55 commit 3fab4c8
Showing 2 changed files with 238 additions and 51 deletions.
ImportServiceBean.java
@@ -33,6 +33,7 @@
import edu.harvard.iq.dataverse.engine.command.impl.CreateHarvestedDatasetCommand;
import edu.harvard.iq.dataverse.engine.command.impl.CreateNewDatasetCommand;
import edu.harvard.iq.dataverse.engine.command.impl.DestroyDatasetCommand;
import edu.harvard.iq.dataverse.engine.command.impl.UpdateHarvestedDatasetCommand;
import edu.harvard.iq.dataverse.harvest.client.HarvestingClient;
import edu.harvard.iq.dataverse.search.IndexServiceBean;
import edu.harvard.iq.dataverse.settings.SettingsServiceBean;
@@ -43,6 +44,7 @@
import edu.harvard.iq.dataverse.license.LicenseServiceBean;
import edu.harvard.iq.dataverse.pidproviders.PidUtil;
import static edu.harvard.iq.dataverse.search.IndexServiceBean.solrDocIdentifierFile;
import edu.harvard.iq.dataverse.util.DatasetFieldUtil;

import java.io.File;
import java.io.FileOutputStream;
@@ -366,25 +368,29 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
f.setSingleValue(DatasetField.NA_VALUE);
}
}


// @todo? - is this the right place to call tidyUpFields()?
// usually it is called within the body of the create/update commands.
DatasetFieldUtil.tidyUpFields(harvestedVersion.getDatasetFields(), true);

// Check data against validation constraints
// Similarly to how we handle missing required values (above), we
// replace invalid values with NA when harvesting.

boolean sanitized = validateDatasetVersion(harvestedVersion, true, cleanupLog);
boolean sanitized = validateAndSanitizeVersionMetadata(harvestedVersion, cleanupLog);

// Note: this sanitizing approach, of replacing invalid values with
// "NA" does not work with certain fields. For example, using it to
// populate a GeoBox coordinate value will result in an invalid
// field. So we will attempt to validate the santized version again,
// this time around, it will throw an exception if still invalid, so
// we'll stop before proceeding any further.
// field. So we will attempt to re-validate the sanitized version.
// This time around, it will throw an exception if still invalid, so
// that we'll stop before proceeding any further:

if (sanitized) {

validateVersionMetadata(harvestedVersion, cleanupLog);
}

Set<ConstraintViolation> invalidViolations = harvestedVersion.validate();
/*Set<ConstraintViolation> invalidViolations = harvestedVersion.validate();
if (!invalidViolations.isEmpty()) {
for (ConstraintViolation<DatasetFieldValue> v : invalidViolations) {
DatasetFieldValue f = v.getRootBean();
@@ -397,7 +403,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
// using it to populate a GeoBox coordinate value is going
// to result in an invalid field. @todo? - see below
}
}
}*/

// @todo? - re-validate the version before we do anything else?
// something along the lines of
@@ -407,6 +413,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
if (existingDataset != null) {
// @todo
// ... do the magic - parse the version json, do the switcheroo ...
/*
DatasetVersion existingVersion = existingDataset.getVersions().get(0);
Map<String, Integer> existingFilesIndex = new HashMap<>();
@@ -490,6 +497,9 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
// UpdateHarvestedDatasetCommand() ? (later on)
importedDataset = em.merge(existingDataset);
//@todo reindex
*/

importedDataset = engineSvc.submit(new UpdateHarvestedDatasetCommand(existingDataset, harvestedVersion, dataverseRequest));

} else {
importedDataset = engineSvc.submit(new CreateHarvestedDatasetCommand(harvestedDataset, dataverseRequest));
@@ -512,49 +522,6 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
}
return importedDataset;
}
/**
* Shortcut method for validating AND attempting to sanitize a DatasetVersion
* @param version
* @param cleanupLog - any invalid values and their replacements are logged there
* @return true if any invalid values were encountered and sanitized
* @throws ImportException (although it should never happen in this mode)
*/
private boolean validateAndSanitizeVersionMetadata(DatasetVersion version, PrintWriter cleanupLog) throws ImportException {
return validateVersionMetadata(version, true, cleanupLog);
}

private void validateVersionMetadata(DatasetVersion version, PrintWriter log) throws ImportException {
validateVersionMetadata(version, false, log);
}

private boolean validateVersionMetadata(DatasetVersion version, boolean sanitize, PrintWriter cleanupLog) throws ImportException {
boolean fixed = false;
Set<ConstraintViolation> invalidViolations = version.validate();
if (!invalidViolations.isEmpty()) {
for (ConstraintViolation<DatasetFieldValue> v : invalidViolations) {
DatasetFieldValue f = v.getRootBean();

String msg = "Invalid metadata field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; "
+ "Invalid value: '" + f.getValue() + "'";
if (sanitize) {
msg += ", replaced with '" + DatasetField.NA_VALUE + "'";
f.setValue(DatasetField.NA_VALUE);
fixed = true;
}
cleanupLog.println(msg);

// Note: "NA" does not work with certain fields. For example,
// using it to populate a GeoBox coordinate value is going
// to result in an invalid field. So we'll need to validate the
// version again after the first, sanitizing pass and see if it
// helped or not.
}
if (!sanitize) {
throw new ImportException("Version was still failing validation after the first attempt to sanitize the invalid values.");
}
}
return fixed;
}

public JsonObject ddiToJson(String xmlToParse) throws ImportException, XMLStreamException {
DatasetDTO dsDTO = importDDIService.doImport(ImportType.IMPORT, xmlToParse);
@@ -855,6 +822,67 @@ private String convertInvalidDateString(String inString){
return null;
}

/**
* A shortcut method for validating AND attempting to sanitize a DatasetVersion
* @param version
* @param cleanupLog - any invalid values and their replacements are logged there
* @return true if any invalid values were encountered and sanitized
* @throws ImportException (although it should never happen in this mode)
*/
private boolean validateAndSanitizeVersionMetadata(DatasetVersion version, PrintWriter cleanupLog) throws ImportException {
return validateVersionMetadata(version, true, cleanupLog);
}

/**
* A shortcut method for validating a DatasetVersion; will throw an exception
* if invalid, without attempting to sanitize the invalid values.
* @param version
* @param log - will log the invalid fields encountered there
* @throws ImportException
*/
private void validateVersionMetadata(DatasetVersion version, PrintWriter log) throws ImportException {
validateVersionMetadata(version, false, log);
}

/**
* Validates the metadata fields of a newly-created version and, depending on
* the "sanitize" flag supplied, may attempt to sanitize any invalid
* values by replacing them with "NA".
* @param version
* @param sanitize - boolean indicating whether to attempt to fix invalid values
* @param cleanupLog - any invalid values encountered will be logged there
* @return - true if any invalid values have been replaced
* @throws ImportException
*/
private boolean validateVersionMetadata(DatasetVersion version, boolean sanitize, PrintWriter cleanupLog) throws ImportException {
boolean fixed = false;
Set<ConstraintViolation> invalidViolations = version.validate();
if (!invalidViolations.isEmpty()) {
for (ConstraintViolation<DatasetFieldValue> v : invalidViolations) {
DatasetFieldValue f = v.getRootBean();

String msg = "Invalid metadata field: " + f.getDatasetField().getDatasetFieldType().getDisplayName() + "; "
+ "Invalid value: '" + f.getValue() + "'";
if (sanitize) {
msg += ", replaced with '" + DatasetField.NA_VALUE + "'";
f.setValue(DatasetField.NA_VALUE);
fixed = true;
}
cleanupLog.println(msg);

// Note: "NA" does not work with certain fields. For example,
// using it to populate a GeoBox coordinate value is going
// to result in an invalid field. So we'll need to validate the
// version again after the first, sanitizing pass and see if it
// helped or not.
}
if (!sanitize) {
throw new ImportException("Version was still failing validation after the first attempt to sanitize the invalid values.");
}
}
return fixed;
}


private static class MyCustomFormatter extends Formatter {

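For reference, the two-pass approach above (one sanitizing validation pass, then a strict re-validation that throws) can be reproduced outside of Dataverse with plain Jakarta Bean Validation, the same ConstraintViolation machinery behind DatasetVersion.validate(). This is a minimal standalone sketch, not code from the commit: CoordinateField and its pattern are hypothetical stand-ins for a GeoBox-style field where "NA" is itself an invalid value, and it assumes a Bean Validation implementation such as Hibernate Validator on the classpath.

import jakarta.validation.ConstraintViolation;
import jakarta.validation.Validation;
import jakarta.validation.Validator;
import jakarta.validation.constraints.Pattern;

public class TwoPassValidationSketch {

    static final String NA_VALUE = "NA";

    // Hypothetical stand-in for a field type where "NA" is itself invalid,
    // the way a GeoBox coordinate is in the real codebase.
    static class CoordinateField {
        @Pattern(regexp = "-?\\d+(\\.\\d+)?", message = "not a decimal coordinate")
        String value;

        CoordinateField(String value) { this.value = value; }
    }

    public static void main(String[] args) {
        Validator validator = Validation.buildDefaultValidatorFactory().getValidator();
        CoordinateField field = new CoordinateField("twelve degrees");

        // Pass 1: replace each invalid value with NA_VALUE and log the substitution.
        boolean sanitized = false;
        for (ConstraintViolation<CoordinateField> v : validator.validate(field)) {
            System.out.println("Invalid value: '" + field.value + "', replaced with '" + NA_VALUE + "'");
            field.value = NA_VALUE;
            sanitized = true;
        }

        // Pass 2: re-validate; "NA" still fails the coordinate pattern,
        // so we stop here instead of proceeding with a broken version.
        if (sanitized && !validator.validate(field).isEmpty()) {
            throw new IllegalStateException("still failing validation after sanitizing the invalid values");
        }
    }
}

The second pass exists precisely because of the GeoBox caveat in the comments above: substituting "NA" can fix a required-value violation, but it can just as easily introduce a new format violation, so the sanitized version cannot be trusted without re-checking.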
UpdateHarvestedDatasetCommand.java (new file)
@@ -0,0 +1,159 @@
package edu.harvard.iq.dataverse.engine.command.impl;

import edu.harvard.iq.dataverse.authorization.Permission;
import edu.harvard.iq.dataverse.engine.command.CommandContext;
import edu.harvard.iq.dataverse.engine.command.DataverseRequest;
import edu.harvard.iq.dataverse.engine.command.RequiredPermissions;
import edu.harvard.iq.dataverse.engine.command.exception.CommandException;
import edu.harvard.iq.dataverse.engine.command.exception.IllegalCommandException;
import edu.harvard.iq.dataverse.Dataset;
import edu.harvard.iq.dataverse.DatasetVersion;
import edu.harvard.iq.dataverse.DataFile;
import edu.harvard.iq.dataverse.FileMetadata;
import static edu.harvard.iq.dataverse.search.IndexServiceBean.solrDocIdentifierFile;
import java.io.IOException;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.solr.client.solrj.SolrServerException;

/**
*
* @author landreev
*
* Much simplified version of UpdateDatasetVersionCommand,
* but with some extra twists.
*/
@RequiredPermissions(Permission.EditDataset)
public class UpdateHarvestedDatasetCommand extends AbstractDatasetCommand<Dataset> {

private static final Logger logger = Logger.getLogger(UpdateHarvestedDatasetCommand.class.getCanonicalName());
private final DatasetVersion newHarvestedVersion;
final private boolean validateLenient = true;

public UpdateHarvestedDatasetCommand(Dataset theDataset, DatasetVersion newHarvestedVersion, DataverseRequest aRequest) {
super(aRequest, theDataset);
this.newHarvestedVersion = newHarvestedVersion;
}

public boolean isValidateLenient() {
return validateLenient;
}

@Override
public Dataset execute(CommandContext ctxt) throws CommandException {

// ... do the magic - parse the version json, do the switcheroo ...
Dataset existingDataset = getDataset();

if (existingDataset == null
|| existingDataset.getId() == null
|| !existingDataset.isHarvested()
|| existingDataset.getVersions().size() != 1) {
throw new IllegalCommandException("The command can only be called on an existing harvested dataset with only 1 version", this);
}
DatasetVersion existingVersion = existingDataset.getVersions().get(0);

if (newHarvestedVersion == null || newHarvestedVersion.getId() != null) {
throw new IllegalCommandException("The command can only be called with a newly-harvested, not yet saved DatasetVersion supplied", this);
}

Map<String, Integer> existingFilesIndex = new HashMap<>();

for (int i = 0; i < existingDataset.getFiles().size(); i++) {
String storageIdentifier = existingDataset.getFiles().get(i).getStorageIdentifier();
if (storageIdentifier != null) {
existingFilesIndex.put(storageIdentifier, i);
}
}

for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) {
// is it safe to assume that each new FileMetadata will be
// pointing to a non-null DataFile here?
String location = newFileMetadata.getDataFile().getStorageIdentifier();
if (location != null && existingFilesIndex.containsKey(location)) {
newFileMetadata.getDataFile().setFileMetadatas(new ArrayList<>());

int fileIndex = existingFilesIndex.get(location);
newFileMetadata.setDataFile(existingDataset.getFiles().get(fileIndex));
existingDataset.getFiles().get(fileIndex).getFileMetadatas().add(newFileMetadata);
existingFilesIndex.remove(location);
}
}
// @todo check that the newly-harvested DataFiles have the same checksums
// and mime types etc. These values are supposed to be immutable, normally,
// but who knows - they may have fixed something invalid on the other end
// @todo check if there's anything special that needs to be done for things
// like file categories

List<String> solrIdsOfDocumentsToDelete = new ArrayList<>();

// Go through the existing files and delete the ones that are
// no longer present in the version that we have just harvested:
for (FileMetadata oldFileMetadata : existingDataset.getVersions().get(0).getFileMetadatas()) {
DataFile oldDataFile = oldFileMetadata.getDataFile();
solrIdsOfDocumentsToDelete.add(solrDocIdentifierFile + oldDataFile.getId());
existingDataset.getFiles().remove(oldDataFile);
// Files from harvested datasets are removed unceremoniously,
// directly in the database. No need to bother calling the
// DeleteFileCommand on them.
ctxt.em().remove(ctxt.em().merge(oldDataFile));
ctxt.em().remove(ctxt.em().merge(oldFileMetadata));
oldDataFile = null;
oldFileMetadata = null;
}

// purge all the SOLR documents associated with the files
// we have just deleted:
if (!solrIdsOfDocumentsToDelete.isEmpty()) {
ctxt.index().deleteHarvestedDocuments(solrIdsOfDocumentsToDelete);
}

// ... And now delete the existing version itself:
existingDataset.setVersions(new ArrayList<>());
ctxt.em().remove(ctxt.em().merge(existingVersion));

// Now attach the newly-harvested version to the dataset:
existingDataset.getVersions().add(newHarvestedVersion);
newHarvestedVersion.setDataset(existingDataset);

// ... There's one more thing to do - go through the new files,
// that are not in the database yet, and make sure they are
// attached to this existing dataset:
for (FileMetadata newFileMetadata : newHarvestedVersion.getFileMetadatas()) {
if (newFileMetadata.getDataFile().getId() == null) {
existingDataset.getFiles().add(newFileMetadata.getDataFile());
newFileMetadata.getDataFile().setOwner(existingDataset);
}
}

ctxt.em().persist(newHarvestedVersion);

Dataset savedDataset = ctxt.em().merge(existingDataset);
ctxt.em().flush();

//@todo reindex

return savedDataset;
}

@Override
public boolean onSuccess(CommandContext ctxt, Object r) {
boolean retVal = true;
Dataset d = (Dataset) r;

try {
// Note that we index harvested datasets synchronously:
ctxt.index().indexDataset(d, true);
} catch (SolrServerException|IOException solrServerEx) {
logger.log(Level.WARNING, "Exception while trying to index the updated Harvested dataset " + d.getGlobalId().asString(), solrServerEx);
retVal = false;
}

return retVal;
}
}
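The core of the command is the file "switcheroo": each newly-harvested FileMetadata is re-pointed at the DataFile already in the database, matched on storage identifier. Below is a stripped-down sketch of just that matching step, with hypothetical stand-in classes (StoredFile, FileEntry) in place of the real JPA entities so that it runs on its own:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class HarvestedFileMatchSketch {

    // Minimal stand-ins for the DataFile and FileMetadata entities;
    // only the fields that the matching step actually touches.
    static class StoredFile {
        String storageIdentifier;
        List<FileEntry> fileMetadatas = new ArrayList<>();
        StoredFile(String storageIdentifier) { this.storageIdentifier = storageIdentifier; }
    }

    static class FileEntry {
        StoredFile dataFile;
        FileEntry(StoredFile dataFile) { this.dataFile = dataFile; }
    }

    public static void main(String[] args) {
        List<StoredFile> existingFiles = new ArrayList<>(List.of(
                new StoredFile("s3://bucket/abc123"),
                new StoredFile("s3://bucket/def456")));
        List<FileEntry> newMetadatas = new ArrayList<>(List.of(
                new FileEntry(new StoredFile("s3://bucket/abc123"))));

        // Index the existing files by storage identifier for constant-time lookups.
        Map<String, Integer> existingFilesIndex = new HashMap<>();
        for (int i = 0; i < existingFiles.size(); i++) {
            String sid = existingFiles.get(i).storageIdentifier;
            if (sid != null) {
                existingFilesIndex.put(sid, i);
            }
        }

        // Re-point each newly-harvested metadata entry at the file already
        // in the database, dropping the placeholder file it arrived with.
        for (FileEntry newEntry : newMetadatas) {
            String sid = newEntry.dataFile.storageIdentifier;
            if (sid != null && existingFilesIndex.containsKey(sid)) {
                StoredFile existing = existingFiles.get(existingFilesIndex.get(sid));
                newEntry.dataFile = existing;
                existing.fileMetadatas.add(newEntry);
                existingFilesIndex.remove(sid);
            }
        }

        // Here, abc123 was matched and def456 was not re-harvested:
        System.out.println("unmatched storage identifiers: " + existingFilesIndex.keySet());
    }
}

Building the HashMap index first keeps the matching linear in the number of files; anything still left in the index after the loop is a file that has disappeared from the newly-harvested version.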
