From f149e121e42496f04cdc7d9190d6fe6e40d01ee6 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Mon, 30 Mar 2020 22:48:35 -0400 Subject: [PATCH] Physical file validation framework, refined; (#6558) --- .../edu/harvard/iq/dataverse/DatasetPage.java | 13 +++- .../iq/dataverse/DatasetServiceBean.java | 5 ++ .../FinalizeDatasetPublicationCommand.java | 65 ++++++++++++++++--- .../command/impl/PublishDatasetCommand.java | 11 +++- .../settings/SettingsServiceBean.java | 6 +- .../iq/dataverse/util/SystemConfig.java | 5 ++ src/main/java/propertyFiles/Bundle.properties | 6 ++ 7 files changed, 97 insertions(+), 14 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index 0ece7e9c4c2..a789cbdcbad 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -30,6 +30,7 @@ import edu.harvard.iq.dataverse.engine.command.impl.GetPrivateUrlCommand; import edu.harvard.iq.dataverse.engine.command.impl.LinkDatasetCommand; import edu.harvard.iq.dataverse.engine.command.impl.PublishDatasetCommand; +import edu.harvard.iq.dataverse.engine.command.impl.FinalizeDatasetPublicationCommand; import edu.harvard.iq.dataverse.engine.command.impl.PublishDataverseCommand; import edu.harvard.iq.dataverse.engine.command.impl.UpdateDatasetVersionCommand; import edu.harvard.iq.dataverse.export.ExportException; @@ -2017,8 +2018,16 @@ private String init(boolean initFull) { datasetService.removeDatasetLocks(dataset.getId(), DatasetLock.Reason.pidRegister); }*/ if (dataset.isLockedFor(DatasetLock.Reason.pidRegister)) { - JH.addMessage(FacesMessage.SEVERITY_WARN, BundleUtil.getStringFromBundle("dataset.publish.workflow.message"), - BundleUtil.getStringFromBundle("dataset.pidRegister.workflow.inprogress")); + // the "pidRegister" lock is used to lock the dataset for BOTH the + // asynchronous persistent id registration for files AND (or) + // physical file validation. + if (FinalizeDatasetPublicationCommand.FILE_VALIDATION_ERROR.equals(dataset.getLockFor(DatasetLock.Reason.pidRegister).getInfo())) { + JH.addMessage(FacesMessage.SEVERITY_ERROR, BundleUtil.getStringFromBundle("dataset.publish.file.validation.error.message"), + BundleUtil.getStringFromBundle("dataset.publish.file.validation.error.details")); + } else { + JH.addMessage(FacesMessage.SEVERITY_WARN, BundleUtil.getStringFromBundle("dataset.publish.workflow.message"), + BundleUtil.getStringFromBundle("dataset.pidRegister.workflow.inprogress")); + } } if (dataset.isLockedFor(DatasetLock.Reason.EditInProgress)) { String rootDataverseName = dataverseService.findRootDataverse().getName(); diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java index 9c0eee6fbf3..53713c3ce6f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java @@ -462,6 +462,11 @@ public void removeDatasetLocks(Dataset dataset, DatasetLock.Reason aReason) { } } + @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) + public void updateDatasetLock(DatasetLock datasetLock) { + em.merge(datasetLock); + } + /* getTitleFromLatestVersion methods use native query to return a dataset title diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java index 482508ccc84..7b678d957bd 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/FinalizeDatasetPublicationCommand.java @@ -57,6 +57,8 @@ public class FinalizeDatasetPublicationCommand extends AbstractPublishDatasetCom */ final boolean datasetExternallyReleased; + public static final String FILE_VALIDATION_ERROR = "FILE VALIDATION ERROR"; + public FinalizeDatasetPublicationCommand(Dataset aDataset, DataverseRequest aRequest) { this( aDataset, aRequest, false ); } @@ -244,9 +246,11 @@ private void validateDataFiles(Dataset dataset, CommandContext ctxt) throws Comm // systemConfig.getFileFixityChecksumAlgorithm() DataFile.ChecksumType checksumType = dataFile.getChecksumType(); if (checksumType == null) { - throw new Exception(BundleUtil.getStringFromBundle("dataset.publish.file.validation.error.noChecksumType", Arrays.asList(dataFile.getId().toString()))); + String info = BundleUtil.getStringFromBundle("dataset.publish.file.validation.error.noChecksumType", Arrays.asList(dataFile.getId().toString())); + logger.log(Level.INFO, info); + throw new Exception(info); } - + StorageIO storage = dataFile.getStorageIO(); storage.open(DataAccessOption.READ_ACCESS); InputStream in = null; @@ -258,28 +262,67 @@ private void validateDataFiles(Dataset dataset, CommandContext ctxt) throws Comm // instead: in = storage.getAuxFileAsInputStream(FileUtil.SAVED_ORIGINAL_FILENAME_EXTENSION); } - + if (in == null) { - throw new Exception(BundleUtil.getStringFromBundle("dataset.publish.file.validation.error.failRead", Arrays.asList(dataFile.getId().toString()))); + String info = BundleUtil.getStringFromBundle("dataset.publish.file.validation.error.failRead", Arrays.asList(dataFile.getId().toString())); + logger.log(Level.INFO, info); + throw new Exception(info); } String recalculatedChecksum = null; try { - FileUtil.calculateChecksum(in, checksumType); + recalculatedChecksum = FileUtil.calculateChecksum(in, checksumType); } catch (RuntimeException rte) { - throw new Exception(BundleUtil.getStringFromBundle("dataset.publish.file.validation.error.failCalculateChecksum", Arrays.asList(dataFile.getId().toString()))); + recalculatedChecksum = null; } finally { IOUtils.closeQuietly(in); } + if (recalculatedChecksum == null) { + String info = BundleUtil.getStringFromBundle("dataset.publish.file.validation.error.failCalculateChecksum", Arrays.asList(dataFile.getId().toString())); + logger.log(Level.INFO, info); + throw new Exception(info); + } + + // TODO: What should we do if the datafile does not have a non-null checksum? + // Should we fail, or should we assume that the recalculated checksum + // is correct, and populate the checksumValue field with it? + if (!recalculatedChecksum.equals(dataFile.getChecksumValue())) { - throw new Exception(BundleUtil.getStringFromBundle("dataset.publish.file.validation.error.wrongChecksumValue", Arrays.asList(dataFile.getId().toString()))); + String info = BundleUtil.getStringFromBundle("dataset.publish.file.validation.error.wrongChecksumValue", Arrays.asList(dataFile.getId().toString())); + logger.log(Level.INFO, info); + throw new Exception(info); } + + logger.log(Level.INFO, "successfully validated DataFile {0}; checksum {1}", new Object[]{dataFile.getId(), recalculatedChecksum}); } } catch (Throwable e) { - ctxt.datasets().removeDatasetLocks(dataset, DatasetLock.Reason.pidRegister); - throw new CommandException(e.getMessage(), this); + // Check if there is a workflow lock on the dataset - i.e., if this + // is being done asynchronously. If so, change the lock message + // to notify the user what went wrong, and leave the lock in place: + + if (dataset.isLockedFor(DatasetLock.Reason.pidRegister)) { + DatasetLock workflowLock = dataset.getLockFor(DatasetLock.Reason.pidRegister); + workflowLock.setInfo(FILE_VALIDATION_ERROR); + ctxt.datasets().updateDatasetLock(workflowLock); + } + + // Throw a new CommandException; if the command is being called + // synchronously, it will be intercepted and the page will display + // the error message for the user. + throw new CommandException(BundleUtil.getStringFromBundle("dataset.publish.file.validation.error.details"), this); + } + + /* + * for debugging only: (TODO: remove before making the final PR) + logger.log(Level.INFO,"Validation successful; but throwing an exception anyway, for testing purposes"); + if (dataset.isLockedFor(DatasetLock.Reason.pidRegister)) { + DatasetLock workflowLock = dataset.getLockFor(DatasetLock.Reason.pidRegister); + workflowLock.setInfo(FILE_VALIDATION_ERROR); + ctxt.datasets().updateDatasetLock(workflowLock); } + throw new CommandException(BundleUtil.getStringFromBundle("dataset.publish.file.validation.error.details"), this); + */ } private void publicizeExternalIdentifier(Dataset dataset, CommandContext ctxt) throws CommandException { @@ -323,6 +366,10 @@ private void publicizeExternalIdentifier(Dataset dataset, CommandContext ctxt) t throw new CommandException(BundleUtil.getStringFromBundle("dataset.publish.error", args), this); } } + /* + * for debugging only: (TODO: remove before making the final PR) + throw new CommandException(BundleUtil.getStringFromBundle("dataset.publish.error", idServiceBean.getProviderInformation()), this); + */ } private void updateFiles(Timestamp updateTime, CommandContext ctxt) throws CommandException { diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/PublishDatasetCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/PublishDatasetCommand.java index f2584d6a153..a5d5fbbe713 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/PublishDatasetCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/PublishDatasetCommand.java @@ -114,9 +114,16 @@ public PublishDatasetResult execute(CommandContext ctxt) throws CommandException if ( registerGlobalIdsForFiles ){ registerGlobalIdsForFiles = currentGlobalAuthority.equals( theDataset.getAuthority() ); } + + boolean validatePhysicalFiles = ctxt.systemConfig().isDatafileValidationOnPublishEnabled(); - if (theDataset.getFiles().size() > ctxt.systemConfig().getPIDAsynchRegFileCount() && registerGlobalIdsForFiles) { - String info = "Adding File PIDs asynchronously"; + if ((registerGlobalIdsForFiles || validatePhysicalFiles) + && theDataset.getFiles().size() > ctxt.systemConfig().getPIDAsynchRegFileCount()) { + // TODO: The time it takes to validate the physical files in the dataset + // is a function of the total file size, NOT the number of files; + // so that's what we should be checking. + String info = registerGlobalIdsForFiles ? "Registering PIDs for Datafiles and " : ""; + info += "Validating Datafiles Asynchronously"; AuthenticatedUser user = request.getAuthenticatedUser(); DatasetLock lock = new DatasetLock(DatasetLock.Reason.pidRegister, user); diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java index 710060ef817..b4049c9b1cf 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java @@ -422,7 +422,11 @@ Whether Harvesting (OAI) service is enabled /** * Shibboleth affiliation attribute which holds information about the affiliation of the user (e.g. ou) */ - ShibAffiliationAttribute + ShibAffiliationAttribute, + /** + * Validate physical files for all the datafiles in the dataset when publishing + */ + FileValidationOnPublishEnabled ; @Override diff --git a/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java b/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java index aefb01992f4..1012cdc37c0 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java @@ -1042,4 +1042,9 @@ public String getMDCLogPath() { String mDCLogPath = settingsService.getValueForKey(SettingsServiceBean.Key.MDCLogPath, null); return mDCLogPath; } + + public boolean isDatafileValidationOnPublishEnabled() { + boolean safeDefaultIfKeyNotFound = true; + return settingsService.isTrueForKey(SettingsServiceBean.Key.FileValidationOnPublishEnabled, safeDefaultIfKeyNotFound); + } } diff --git a/src/main/java/propertyFiles/Bundle.properties b/src/main/java/propertyFiles/Bundle.properties index 2f89a3742ae..6427efcc452 100755 --- a/src/main/java/propertyFiles/Bundle.properties +++ b/src/main/java/propertyFiles/Bundle.properties @@ -1310,6 +1310,12 @@ dataset.locked.editInProgress.message=Edit In Progress dataset.locked.editInProgress.message.details=Additional edits cannot be made at this time. Contact {0} if this status persists. dataset.publish.error=This dataset may not be published due to an error when contacting the {0} Service. Please try again. dataset.publish.error.doi=This dataset may not be published because the DOI update failed. +dataset.publish.file.validation.error.message=Failed to Publish Dataset +dataset.publish.file.validation.error.details=The dataset could not be published because one or more of the datafiles in the dataset could not be validated (physical file missing, checksum mismatch, etc.) Please contact support for assistance. +dataset.publish.file.validation.error.noChecksumType=Checksum type not defined for datafile id {0} +dataset.publish.file.validation.error.failRead=Failed to open datafile id {0} for reading +dataset.publish.file.validation.error.failCalculateChecksum=Failed to calculate checksum for datafile id {0} +dataset.publish.file.validation.error.wrongChecksumValue=Checksum mismatch for datafile id {0} dataset.compute.computeBatchSingle=Compute Dataset dataset.compute.computeBatchList=List Batch dataset.compute.computeBatchAdd=Add to Batch