From b27d266041cb973d66ce695633db774879eca131 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Wed, 22 Mar 2023 15:19:06 -0400 Subject: [PATCH] Changed json parser to modify the "storageidentifiers" of remote files harvested in the proprietary json format. #7736 --- .../api/imports/ImportServiceBean.java | 2 +- .../iq/dataverse/util/json/JsonParser.java | 30 ++++++++++++++++++- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java index bee171950dc..cb6cef6ded5 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java @@ -264,7 +264,7 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve JsonObject obj = jsonReader.readObject(); //and call parse Json to read it into a dataset try { - JsonParser parser = new JsonParser(datasetfieldService, metadataBlockService, settingsService, licenseService); + JsonParser parser = new JsonParser(datasetfieldService, metadataBlockService, settingsService, licenseService, harvestingClient); parser.setLenient(true); Dataset ds = parser.parseDataset(obj); diff --git a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java index 22e2c6c8d78..af5b3595b6e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/json/JsonParser.java @@ -71,6 +71,7 @@ public class JsonParser { MetadataBlockServiceBean blockService; SettingsServiceBean settingsService; LicenseServiceBean licenseService; + HarvestingClient harvestingClient = null; /** * if lenient, we will accept alternate spellings for controlled vocabulary values @@ -85,10 +86,15 @@ public JsonParser(DatasetFieldServiceBean datasetFieldSvc, 
MetadataBlockServiceB } public JsonParser(DatasetFieldServiceBean datasetFieldSvc, MetadataBlockServiceBean blockService, SettingsServiceBean settingsService, LicenseServiceBean licenseService) { + this(datasetFieldSvc, blockService, settingsService, licenseService, null); + } + + public JsonParser(DatasetFieldServiceBean datasetFieldSvc, MetadataBlockServiceBean blockService, SettingsServiceBean settingsService, LicenseServiceBean licenseService, HarvestingClient harvestingClient) { this.datasetFieldSvc = datasetFieldSvc; this.blockService = blockService; this.settingsService = settingsService; this.licenseService = licenseService; + this.harvestingClient = harvestingClient; } public JsonParser() { @@ -522,7 +528,29 @@ public DataFile parseDataFile(JsonObject datafileJson) { if (contentType == null) { contentType = "application/octet-stream"; } - String storageIdentifier = datafileJson.getString("storageIdentifier", " "); + String storageIdentifier = null; + /** + * When harvesting from other Dataverses using this json format, we + * don't want to import their storageidentifiers verbatim. Instead, we + * will modify them to point to the access API location on the remote + * archive side. + */ + if (harvestingClient != null && datafileJson.containsKey("id")) { + String remoteId = datafileJson.getJsonNumber("id").toString(); + storageIdentifier = harvestingClient.getArchiveUrl() + + "/api/access/datafile/" + + remoteId; + /** + * Note that we don't have any practical use for these URLs as + * of now. We used to, in the past, perform some tasks on harvested + * content that involved trying to access the files. In any event, it + * makes more sense to collect these URLs than to keep the storage + * identifiers imported as is, which become completely meaningless + * on the local system. 
+ */ + } else { + storageIdentifier = datafileJson.getString("storageIdentifier", null); + } JsonObject checksum = datafileJson.getJsonObject("checksum"); if (checksum != null) { // newer style that allows for SHA-1 rather than MD5