Added /admin API call that an admin can use to find the files that have failed validation and need to be fixed or removed before the dataset can be published. (#6558)
landreev committed Apr 7, 2020
1 parent 45cfa0f commit c3fbad2
Showing 4 changed files with 128 additions and 53 deletions.
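For reference, the new admin endpoint added in Admin.java below streams a JSON document of roughly the following shape (the values are illustrative; the field names dataFiles, datafileId, status, and errorMessage come from the code in this commit, and /api/admin is the standard mount point of the Dataverse admin API):

{
  "dataFiles": [
    { "datafileId": 42, "status": "valid" },
    { "datafileId": 43, "status": "invalid", "errorMessage": "..." }
  ]
}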
1 change: 0 additions & 1 deletion src/main/java/edu/harvard/iq/dataverse/DatasetPage.java
@@ -2059,7 +2059,6 @@ private void displayLockInfo(Dataset dataset) {
JH.addMessage(FacesMessage.SEVERITY_ERROR, BundleUtil.getStringFromBundle("dataset.publish.file.validation.error.message"),
BundleUtil.getStringFromBundle("dataset.publish.file.validation.error.contactSupport"));
}
- /* and now that we've shown the message to the user - remove the lock? */
}
if (dataset.isLockedFor(DatasetLock.Reason.EditInProgress)) {
String rootDataverseName = dataverseService.findRootDataverse().getName();
70 changes: 70 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/api/Admin.java
@@ -1032,6 +1032,76 @@ public Response validateDataset(@PathParam("id") String id, @QueryParam("variabl
}
return ok(msg);
}

// This API does the same thing as /validateDataFileHashValue/{fileId},
// but for all the files in the dataset, with streaming output.
@GET
@Path("validate/dataset/files/{id}")
@Produces({"application/json"})
public Response validateDatasetDatafiles(@PathParam("id") String id) {

// Streaming output: the API will start producing
// the output right away, as it goes through the list
// of the datafiles in the dataset.
// The streaming mechanism is modeled after the validate/datasets API.
StreamingOutput stream = new StreamingOutput() {

@Override
public void write(OutputStream os) throws IOException,
WebApplicationException {
Dataset dataset;

try {
dataset = findDatasetOrDie(id);
} catch (Exception ex) {
throw new IOException(ex.getMessage());
}

os.write("{\"dataFiles\": [\n".getBytes());

boolean wroteObject = false;
for (DataFile dataFile : dataset.getFiles()) {
// A dataset may contain a very large number of files, and
// validating each one can be slow; that is why each result is
// written out as soon as it is computed, one file at a time.
boolean success = false;
boolean constraintViolationDetected = false;

JsonObjectBuilder output = Json.createObjectBuilder();
output.add("datafileId", dataFile.getId());
try {
FileUtil.validateDataFileChecksum(dataFile);
success = true;
} catch (IOException ex) {
output.add("status", "invalid");
output.add("errorMessage", ex.getMessage());
}

if (success) {
output.add("status", "valid");
}

// write it out:

if (wroteObject) {
os.write(",\n".getBytes());
}

os.write(output.build().toString().getBytes("UTF8"));

wroteObject = true;
}

os.write("\n]\n}\n".getBytes());
}

};
return Response.ok(stream).build();
}

@Path("assignments/assignees/{raIdtf: .*}")
@GET
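A minimal client sketch for the new streaming endpoint, assuming a local installation with the admin API reachable on localhost (by default Dataverse installations block the admin API for remote connections); the URL and dataset id are illustrative. Reading the response line by line shows each file's result as it is computed, rather than after the whole dataset has been processed:

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.util.stream.Stream;

public class ValidateDatasetFilesClient {
    public static void main(String[] args) throws Exception {
        // Hypothetical instance URL and dataset id; adjust for your installation.
        String url = "http://localhost:8080/api/admin/validate/dataset/files/42";
        HttpRequest request = HttpRequest.newBuilder(URI.create(url)).GET().build();
        // BodyHandlers.ofLines() consumes the body incrementally, matching the
        // StreamingOutput above: each datafile's JSON object arrives on its own
        // line while later files are still being validated server-side.
        HttpResponse<Stream<String>> response = HttpClient.newHttpClient()
                .send(request, HttpResponse.BodyHandlers.ofLines());
        response.body().forEach(System.out::println);
    }
}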
@@ -246,58 +246,7 @@ private void validateDataFiles(Dataset dataset, CommandContext ctxt) throws Comm
// TODO: Should we validate all the files in the dataset, or only
// the files that haven't been published previously?
logger.log(Level.FINE, "validating DataFile {0}", dataFile.getId());

- DataFile.ChecksumType checksumType = dataFile.getChecksumType();
- if (checksumType == null) {
- String info = BundleUtil.getStringFromBundle("dataset.publish.file.validation.error.noChecksumType", Arrays.asList(dataFile.getId().toString()));
- logger.log(Level.INFO, info);
- throw new Exception(info);
- }
-
- StorageIO<DataFile> storage = dataFile.getStorageIO();
- storage.open(DataAccessOption.READ_ACCESS);
- InputStream in = null;
-
- if (!dataFile.isTabularData()) {
- in = storage.getInputStream();
- } else {
- // if this is a tabular file, read the preserved original "auxiliary file"
- // instead:
- in = storage.getAuxFileAsInputStream(FileUtil.SAVED_ORIGINAL_FILENAME_EXTENSION);
- }
-
- if (in == null) {
- String info = BundleUtil.getStringFromBundle("dataset.publish.file.validation.error.failRead", Arrays.asList(dataFile.getId().toString()));
- logger.log(Level.INFO, info);
- throw new Exception(info);
- }
-
- String recalculatedChecksum = null;
- try {
- recalculatedChecksum = FileUtil.calculateChecksum(in, checksumType);
- } catch (RuntimeException rte) {
- recalculatedChecksum = null;
- } finally {
- IOUtils.closeQuietly(in);
- }
-
- if (recalculatedChecksum == null) {
- String info = BundleUtil.getStringFromBundle("dataset.publish.file.validation.error.failCalculateChecksum", Arrays.asList(dataFile.getId().toString()));
- logger.log(Level.INFO, info);
- throw new Exception(info);
- }
-
- // TODO: What should we do if the datafile does not have a non-null checksum?
- // Should we fail, or should we assume that the recalculated checksum
- // is correct, and populate the checksumValue field with it?
-
- if (!recalculatedChecksum.equals(dataFile.getChecksumValue())) {
- String info = BundleUtil.getStringFromBundle("dataset.publish.file.validation.error.wrongChecksumValue", Arrays.asList(dataFile.getId().toString()));
- logger.log(Level.INFO, info);
- throw new Exception(info);
- }
-
- logger.log(Level.INFO, "successfully validated DataFile {0}; checksum {1}", new Object[]{dataFile.getId(), recalculatedChecksum});
+ FileUtil.validateDataFileChecksum(dataFile);
}
} catch (Throwable e) {

57 changes: 57 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java
@@ -82,6 +82,10 @@
import org.apache.commons.io.FilenameUtils;

import com.amazonaws.AmazonServiceException;
import edu.harvard.iq.dataverse.dataaccess.DataAccessOption;
import edu.harvard.iq.dataverse.dataaccess.StorageIO;
import java.util.Arrays;
import org.apache.commons.io.IOUtils;

/**
* a 4.0 implementation of the DVN FileUtil;
@@ -1675,6 +1679,59 @@ public static S3AccessIO getS3AccessForDirectUpload(Dataset dataset) {
return s3io;
}

public static void validateDataFileChecksum(DataFile dataFile) throws IOException {
DataFile.ChecksumType checksumType = dataFile.getChecksumType();
if (checksumType == null) {
String info = BundleUtil.getStringFromBundle("dataset.publish.file.validation.error.noChecksumType", Arrays.asList(dataFile.getId().toString()));
logger.log(Level.INFO, info);
throw new IOException(info);
}

StorageIO<DataFile> storage = dataFile.getStorageIO();
storage.open(DataAccessOption.READ_ACCESS);
InputStream in = null;

if (!dataFile.isTabularData()) {
in = storage.getInputStream();
} else {
// if this is a tabular file, read the preserved original "auxiliary file"
// instead:
in = storage.getAuxFileAsInputStream(FileUtil.SAVED_ORIGINAL_FILENAME_EXTENSION);
}

if (in == null) {
String info = BundleUtil.getStringFromBundle("dataset.publish.file.validation.error.failRead", Arrays.asList(dataFile.getId().toString()));
logger.log(Level.INFO, info);
throw new IOException(info);
}

String recalculatedChecksum = null;
try {
recalculatedChecksum = FileUtil.calculateChecksum(in, checksumType);
} catch (RuntimeException rte) {
recalculatedChecksum = null;
} finally {
IOUtils.closeQuietly(in);
}

if (recalculatedChecksum == null) {
String info = BundleUtil.getStringFromBundle("dataset.publish.file.validation.error.failCalculateChecksum", Arrays.asList(dataFile.getId().toString()));
logger.log(Level.INFO, info);
throw new IOException(info);
}

// TODO: What should we do if the datafile's stored checksum is null?
// Should we fail, or should we assume that the recalculated checksum
// is correct, and populate the checksumValue field with it?
if (!recalculatedChecksum.equals(dataFile.getChecksumValue())) {
String info = BundleUtil.getStringFromBundle("dataset.publish.file.validation.error.wrongChecksumValue", Arrays.asList(dataFile.getId().toString()));
logger.log(Level.INFO, info);
throw new IOException(info);
}

logger.log(Level.INFO, "successfully validated DataFile {0}; checksum {1}", new Object[]{dataFile.getId(), recalculatedChecksum});
}

public static String getStorageIdentifierFromLocation(String location) {
int driverEnd = location.indexOf("://") + 3;
int bucketEnd = driverEnd + location.substring(driverEnd).indexOf("/");
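For context, a sketch of how callers consume the refactored helper, mirroring the two call sites touched by this commit: a normal return means the recalculated checksum matched the stored value, while every failure mode (missing checksum type, unreadable storage, failed recalculation, mismatch) surfaces as an IOException carrying the localized message. The logger is assumed to be the caller's own:

try {
    FileUtil.validateDataFileChecksum(dataFile);
    // normal return: the file passed validation
} catch (IOException ex) {
    // the message is the localized reason, e.g. the bundle string
    // dataset.publish.file.validation.error.wrongChecksumValue
    logger.log(Level.WARNING, "datafile {0} failed validation: {1}",
            new Object[]{dataFile.getId(), ex.getMessage()});
}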
