Merge pull request #8827 from poikilotherm/7000-mpconfig-files-basic
7000 mpconfig files basic
kcondon authored Jan 18, 2023
2 parents 4960f03 + b7aecf5 commit 1bef93a
Showing 14 changed files with 122 additions and 97 deletions.
2 changes: 2 additions & 0 deletions doc/sphinx-guides/source/api/native-api.rst
@@ -552,6 +552,8 @@ You should expect an HTTP 200 ("OK") response and JSON indicating the database I

.. note:: Only a Dataverse installation account with superuser permissions is allowed to include files when creating a dataset via this API. Adding files this way only adds their file metadata to the database, you will need to manually add the physical files to the file system.

.. _api-import-dataset:

Import a Dataset into a Dataverse Collection
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

25 changes: 21 additions & 4 deletions doc/sphinx-guides/source/installation/config.rst
@@ -288,7 +288,9 @@ To support multiple stores, a Dataverse installation now requires an id, type, a
Out of the box, a Dataverse installation is configured to use local file storage in the 'file' store by default. You can add additional stores and, as a superuser, configure specific Dataverse collections to use them (by editing the 'General Information' for the Dataverse collection as described in the :doc:`/admin/dataverses-datasets` section).

Note that the "\-Ddataverse.files.directory", if defined, continues to control where temporary files are stored (in the /temp subdir of that directory), independent of the location of any 'file' store defined above.
Note that the "\-Ddataverse.files.directory", if defined, continues to control where temporary files are stored
(in the /temp subdir of that directory), independent of the location of any 'file' store defined above.
(See also the option reference: :ref:`dataverse.files.directory`)

If you wish to change which store is used by default, you'll need to delete the existing default storage driver and set a new one using jvm options.

@@ -299,6 +301,8 @@
It is also possible to set maximum file upload size limits per store. See the :ref:`:MaxFileUploadSizeInBytes` setting below.

.. _storage-files-dir:

File Storage
++++++++++++

@@ -1517,13 +1521,26 @@ protocol, host, and port number and should not include a trailing slash.
- We are absolutely aware that it's confusing to have both ``dataverse.fqdn`` and ``dataverse.siteUrl``.
https://github.com/IQSS/dataverse/issues/6636 is about resolving this confusion.


.. _dataverse.files.directory:

dataverse.files.directory
+++++++++++++++++++++++++

This is how you configure the path Dataverse uses for temporary files. (File store specific dataverse.files.\<id\>.directory options set the permanent data storage locations.)
Please provide an absolute path to a directory backed by some mounted file system. This directory is used for a number
of purposes:

1. ``<dataverse.files.directory>/temp`` after uploading, data is temporarily stored here for ingest and/or before
shipping to the final storage destination.
2. ``<dataverse.files.directory>/sword`` a place to store uploads via the :doc:`../api/sword` before transfer
to final storage location and/or ingest.
3. ``<dataverse.files.directory>/googlecloudkey.json`` used with :ref:`Google Cloud Configuration` for BagIt exports.
This location is deprecated and might be refactored into a distinct setting in the future.
4. The experimental DCM feature for :doc:`../developers/big-data-support` is able to trigger imports for externally
uploaded files in a directory tree at ``<dataverse.files.directory>/<PID Authority>/<PID Identifier>``
under certain conditions. This directory may also be used by file stores for :ref:`permanent file storage <storage-files-dir>`,
but this is controlled by other, store-specific settings.

Defaults to ``/tmp/dataverse``. Can also be set via *MicroProfile Config API* sources, e.g. the environment variable ``DATAVERSE_FILES_DIRECTORY``.

.. _dataverse.files.uploads:

@@ -3235,7 +3252,7 @@ For example:

``curl -X PUT -d "This content needs to go through an additional review by the Curation Team before it can be published." http://localhost:8080/api/admin/settings/:DatasetMetadataValidationFailureMsg``


:ExternalValidationAdminOverride
++++++++++++++++++++++++++++++++

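The config.rst hunks above document that dataverse.files.directory is now resolved through MicroProfile Config, with /tmp/dataverse as the default and DATAVERSE_FILES_DIRECTORY as an equivalent environment variable. As a rough, standalone illustration only (not part of this commit; the class and method names below are invented for the example), the same value could be read with the plain MicroProfile Config API:

import org.eclipse.microprofile.config.Config;
import org.eclipse.microprofile.config.ConfigProvider;
import java.nio.file.Path;

public class FilesDirectoryExample {
    public static Path filesDirectory() {
        Config config = ConfigProvider.getConfig();
        // Falls back to the documented default when no config source defines the key.
        String dir = config.getOptionalValue("dataverse.files.directory", String.class)
                           .orElse("/tmp/dataverse");
        return Path.of(dir);
    }

    public static void main(String[] args) {
        // System properties and environment variables are default MicroProfile Config sources,
        // so -Ddataverse.files.directory or DATAVERSE_FILES_DIRECTORY can both supply the value.
        System.out.println("Temp files root: " + filesDirectory());
    }
}

This matches the documentation added above: any MicroProfile Config source may provide the setting, which is why the environment variable is mentioned as an alternative to the JVM option.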
9 changes: 4 additions & 5 deletions src/main/java/edu/harvard/iq/dataverse/Dataset.java
@@ -33,6 +33,8 @@
import javax.persistence.Table;
import javax.persistence.Temporal;
import javax.persistence.TemporalType;

import edu.harvard.iq.dataverse.settings.JvmSettings;
import edu.harvard.iq.dataverse.util.StringUtil;
import edu.harvard.iq.dataverse.util.SystemConfig;

@@ -530,11 +532,8 @@ private Collection<String> getCategoryNames() {
@Deprecated
public Path getFileSystemDirectory() {
Path studyDir = null;

String filesRootDirectory = System.getProperty("dataverse.files.directory");
if (filesRootDirectory == null || filesRootDirectory.equals("")) {
filesRootDirectory = "/tmp/files";
}

String filesRootDirectory = JvmSettings.FILES_DIRECTORY.lookup();

if (this.getAlternativePersistentIndentifiers() != null && !this.getAlternativePersistentIndentifiers().isEmpty()) {
for (AlternativePersistentIdentifier api : this.getAlternativePersistentIndentifiers()) {
7 changes: 3 additions & 4 deletions src/main/java/edu/harvard/iq/dataverse/EditDatafilesPage.java
@@ -33,6 +33,7 @@
import edu.harvard.iq.dataverse.ingest.IngestUtil;
import edu.harvard.iq.dataverse.license.LicenseServiceBean;
import edu.harvard.iq.dataverse.search.IndexServiceBean;
import edu.harvard.iq.dataverse.settings.JvmSettings;
import edu.harvard.iq.dataverse.settings.Setting;
import edu.harvard.iq.dataverse.settings.SettingsServiceBean;
import edu.harvard.iq.dataverse.util.FileUtil;
@@ -2428,10 +2429,8 @@ public boolean isTemporaryPreviewAvailable(String fileSystemId, String mimeType)
return false;
}

String filesRootDirectory = System.getProperty("dataverse.files.directory");
if (filesRootDirectory == null || filesRootDirectory.isEmpty()) {
filesRootDirectory = "/tmp/files";
}
// Retrieve via MPCONFIG. Has sane default /tmp/dataverse from META-INF/microprofile-config.properties
String filesRootDirectory = JvmSettings.FILES_DIRECTORY.lookup();

String fileSystemName = filesRootDirectory + "/temp/" + fileSystemId;

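For context only (not code from this commit): the isTemporaryPreviewAvailable hunk above builds the path <dataverse.files.directory>/temp/<fileSystemId>, which the rest of the method (not shown) uses to look for the temporary file. A minimal sketch of such a check, with an invented helper name and placeholder values:

import java.nio.file.Files;
import java.nio.file.Path;

public class TempPreviewCheckExample {
    // fileSystemId is the temporary storage identifier a file gets while it awaits ingest.
    static boolean temporaryFileExists(String filesRootDirectory, String fileSystemId) {
        Path tempFile = Path.of(filesRootDirectory, "temp", fileSystemId);
        return Files.isReadable(tempFile);
    }

    public static void main(String[] args) {
        // Placeholder id; real identifiers are generated by the storage layer.
        System.out.println(temporaryFileExists("/tmp/dataverse", "17f2b1aa0c1-example"));
    }
}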
@@ -1,5 +1,6 @@
package edu.harvard.iq.dataverse.api.datadeposit;

import edu.harvard.iq.dataverse.settings.JvmSettings;
import edu.harvard.iq.dataverse.util.SystemConfig;
import java.io.File;
import java.util.Arrays;
@@ -86,37 +87,32 @@ public boolean storeAndCheckBinary() {

@Override
public String getTempDirectory() {
String tmpFileDir = System.getProperty(SystemConfig.FILES_DIRECTORY);
if (tmpFileDir != null) {
String swordDirString = tmpFileDir + File.separator + "sword";
File swordDirFile = new File(swordDirString);
/**
* @todo Do we really need this check? It seems like we do because
* if you create a dataset via the native API and then later try to
* upload a file via SWORD, the directory defined by
* dataverse.files.directory may not exist and we get errors deep in
* the SWORD library code. Could maybe use a try catch in the doPost
* method of our SWORDv2MediaResourceServlet.
*/
if (swordDirFile.exists()) {
// will throw a runtime exception when not found
String tmpFileDir = JvmSettings.FILES_DIRECTORY.lookup();

String swordDirString = tmpFileDir + File.separator + "sword";
File swordDirFile = new File(swordDirString);
/**
* @todo Do we really need this check? It seems like we do because
* if you create a dataset via the native API and then later try to
* upload a file via SWORD, the directory defined by
* dataverse.files.directory may not exist and we get errors deep in
* the SWORD library code. Could maybe use a try catch in the doPost
* method of our SWORDv2MediaResourceServlet.
*/
if (swordDirFile.exists()) {
return swordDirString;
} else {
boolean mkdirSuccess = swordDirFile.mkdirs();
if (mkdirSuccess) {
logger.info("Created directory " + swordDirString);
return swordDirString;
} else {
boolean mkdirSuccess = swordDirFile.mkdirs();
if (mkdirSuccess) {
logger.info("Created directory " + swordDirString);
return swordDirString;
} else {
String msgForSwordUsers = ("Could not determine or create SWORD temp directory. Check logs for details.");
logger.severe(msgForSwordUsers + " Failed to create " + swordDirString);
// sadly, must throw RunTimeException to communicate with SWORD user
throw new RuntimeException(msgForSwordUsers);
}
String msgForSwordUsers = ("Could not determine or create SWORD temp directory. Check logs for details.");
logger.severe(msgForSwordUsers + " Failed to create " + swordDirString);
// sadly, must throw RunTimeException to communicate with SWORD user
throw new RuntimeException(msgForSwordUsers);
}
} else {
String msgForSwordUsers = ("JVM option \"" + SystemConfig.FILES_DIRECTORY + "\" not defined. Check logs for details.");
logger.severe(msgForSwordUsers);
// sadly, must throw RunTimeException to communicate with SWORD user
throw new RuntimeException(msgForSwordUsers);
}
}

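A hedged sketch of the behavior getTempDirectory() implements above, i.e. ensure that <dataverse.files.directory>/sword exists or fail loudly. It uses java.nio.file.Files.createDirectories instead of File.mkdirs, so it is an alternative formulation under stated assumptions, not the code in this commit:

import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;

public class SwordTempDirExample {
    static String swordTempDirectory(String filesRootDirectory) {
        Path swordDir = Path.of(filesRootDirectory, "sword");
        try {
            // No-op when the directory already exists; creates missing parents otherwise.
            Files.createDirectories(swordDir);
        } catch (IOException e) {
            // Like the code above, surface a runtime exception to the SWORD caller.
            throw new UncheckedIOException("Could not determine or create SWORD temp directory", e);
        }
        return swordDir.toString();
    }
}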
@@ -57,8 +57,10 @@
import javax.inject.Named;
import javax.servlet.http.HttpServletRequest;

import edu.harvard.iq.dataverse.settings.JvmSettings;
import org.apache.commons.io.IOUtils;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.sql.Timestamp;
@@ -79,7 +81,7 @@
@Dependent
public class FileRecordJobListener implements ItemReadListener, StepListener, JobListener {

public static final String SEP = System.getProperty("file.separator");
public static final String SEP = File.separator;

private static final UserNotification.Type notifyType = UserNotification.Type.FILESYSTEMIMPORT;

@@ -433,8 +435,10 @@ private void loadChecksumManifest() {
manifest = checksumManifest;
getJobLogger().log(Level.INFO, "Checksum manifest = " + manifest + " (FileSystemImportJob.xml property)");
}
// construct full path
String manifestAbsolutePath = System.getProperty("dataverse.files.directory")

// Construct full path - retrieve base dir via MPCONFIG.
// (Has sane default /tmp/dataverse from META-INF/microprofile-config.properties)
String manifestAbsolutePath = JvmSettings.FILES_DIRECTORY.lookup()
+ SEP + dataset.getAuthority()
+ SEP + dataset.getIdentifier()
+ SEP + uploadFolder
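As an illustration only (not part of the diff): the manifest path assembled in loadChecksumManifest() above follows the pattern <dataverse.files.directory>/<PID authority>/<PID identifier>/<upload folder>/<manifest>. A self-contained sketch with placeholder values:

import java.nio.file.Path;

public class ManifestPathExample {
    static Path manifestPath(String filesRootDirectory, String authority, String identifier,
                             String uploadFolder, String manifest) {
        // Path.of joins the segments with the platform separator, like the SEP concatenation above.
        return Path.of(filesRootDirectory, authority, identifier, uploadFolder, manifest);
    }

    public static void main(String[] args) {
        // Prints e.g. /tmp/dataverse/10.5072/FK2ABCDEF/upload1/files.sha on Linux (placeholder values).
        System.out.println(manifestPath("/tmp/dataverse", "10.5072", "FK2ABCDEF", "upload1", "files.sha"));
    }
}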
@@ -24,6 +24,7 @@
import edu.harvard.iq.dataverse.authorization.AuthenticationServiceBean;
import edu.harvard.iq.dataverse.authorization.users.AuthenticatedUser;
import edu.harvard.iq.dataverse.batch.jobs.importer.ImportMode;
import edu.harvard.iq.dataverse.settings.JvmSettings;
import org.apache.commons.io.filefilter.NotFileFilter;
import org.apache.commons.io.filefilter.WildcardFileFilter;

@@ -54,7 +55,7 @@
@Dependent
public class FileRecordReader extends AbstractItemReader {

public static final String SEP = System.getProperty("file.separator");
public static final String SEP = File.separator;

@Inject
JobContext jobContext;
@@ -96,9 +97,11 @@ public void init() {

@Override
public void open(Serializable checkpoint) throws Exception {

directory = new File(System.getProperty("dataverse.files.directory")
+ SEP + dataset.getAuthority() + SEP + dataset.getIdentifier() + SEP + uploadFolder);

// Retrieve via MPCONFIG. Has sane default /tmp/dataverse from META-INF/microprofile-config.properties
String baseDir = JvmSettings.FILES_DIRECTORY.lookup();

directory = new File(baseDir + SEP + dataset.getAuthority() + SEP + dataset.getIdentifier() + SEP + uploadFolder);
// TODO:
// The above goes directly to the filesystem directory configured by the
// old "dataverse.files.directory" JVM option (otherwise used for temp
@@ -154,8 +154,8 @@ public static Logger getJobLogger(String jobId) {
try {
Logger jobLogger = Logger.getLogger("job-"+jobId);
FileHandler fh;
String logDir = System.getProperty("com.sun.aas.instanceRoot") + System.getProperty("file.separator")
+ "logs" + System.getProperty("file.separator") + "batch-jobs" + System.getProperty("file.separator");
String logDir = System.getProperty("com.sun.aas.instanceRoot") + File.separator
+ "logs" + File.separator + "batch-jobs" + File.separator;
checkCreateLogDirectory( logDir );
fh = new FileHandler(logDir + "job-" + jobId + ".log");
logger.log(Level.INFO, "JOB LOG: " + logDir + "job-" + jobId + ".log");
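For illustration (not code from this commit): the getJobLogger() hunk above writes per-job logs under <instanceRoot>/logs/batch-jobs/ using File.separator. A minimal java.util.logging sketch of that setup; the directory-creation step and formatter choice are assumptions:

import java.io.File;
import java.io.IOException;
import java.util.logging.FileHandler;
import java.util.logging.Logger;
import java.util.logging.SimpleFormatter;

public class JobLoggerExample {
    static Logger jobLogger(String jobId, String instanceRoot) throws IOException {
        String logDir = instanceRoot + File.separator + "logs" + File.separator + "batch-jobs" + File.separator;
        // Make sure the directory exists before attaching the file handler (assumption for the sketch).
        new File(logDir).mkdirs();
        Logger logger = Logger.getLogger("job-" + jobId);
        FileHandler handler = new FileHandler(logDir + "job-" + jobId + ".log");
        handler.setFormatter(new SimpleFormatter()); // formatter choice is an assumption
        logger.addHandler(handler);
        return logger;
    }
}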
@@ -1,16 +1,27 @@
package edu.harvard.iq.dataverse.engine.command.impl;

import com.google.auth.oauth2.ServiceAccountCredentials;
import com.google.cloud.storage.Blob;
import com.google.cloud.storage.Bucket;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageException;
import com.google.cloud.storage.StorageOptions;
import edu.harvard.iq.dataverse.Dataset;
import edu.harvard.iq.dataverse.DatasetVersion;
import edu.harvard.iq.dataverse.DatasetLock.Reason;
import edu.harvard.iq.dataverse.DatasetVersion;
import edu.harvard.iq.dataverse.authorization.Permission;
import edu.harvard.iq.dataverse.authorization.users.ApiToken;
import edu.harvard.iq.dataverse.engine.command.Command;
import edu.harvard.iq.dataverse.engine.command.DataverseRequest;
import edu.harvard.iq.dataverse.engine.command.RequiredPermissions;
import edu.harvard.iq.dataverse.settings.JvmSettings;
import edu.harvard.iq.dataverse.workflow.step.Failure;
import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult;
import org.apache.commons.codec.binary.Hex;

import javax.json.Json;
import javax.json.JsonObjectBuilder;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.PipedInputStream;
@@ -21,17 +32,6 @@
import java.util.Map;
import java.util.logging.Logger;

import javax.json.Json;
import javax.json.JsonObjectBuilder;

import org.apache.commons.codec.binary.Hex;
import com.google.auth.oauth2.ServiceAccountCredentials;
import com.google.cloud.storage.Blob;
import com.google.cloud.storage.Bucket;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageException;
import com.google.cloud.storage.StorageOptions;

@RequiredPermissions(Permission.PublishDataset)
public class GoogleCloudSubmitToArchiveCommand extends AbstractSubmitToArchiveCommand implements Command<DatasetVersion> {

@@ -56,10 +56,11 @@ public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken t
statusObject.add(DatasetVersion.ARCHIVAL_STATUS, DatasetVersion.ARCHIVAL_STATUS_FAILURE);
statusObject.add(DatasetVersion.ARCHIVAL_STATUS_MESSAGE, "Bag not transferred");

try {
FileInputStream fis = new FileInputStream(System.getProperty("dataverse.files.directory") + System.getProperty("file.separator") + "googlecloudkey.json");
String cloudKeyFile = JvmSettings.FILES_DIRECTORY.lookup() + File.separator + "googlecloudkey.json";

try (FileInputStream cloudKeyStream = new FileInputStream(cloudKeyFile)) {
storage = StorageOptions.newBuilder()
.setCredentials(ServiceAccountCredentials.fromStream(fis))
.setCredentials(ServiceAccountCredentials.fromStream(cloudKeyStream))
.setProjectId(projectName)
.build()
.getService();
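The GoogleCloudSubmitToArchiveCommand hunk above also switches to try-with-resources so the key-file stream is closed even when credential parsing fails. A standalone sketch of that pattern, using the same Google Cloud Storage calls that appear in the diff; the wrapper method and the projectName parameter are invented for the example:

import com.google.auth.oauth2.ServiceAccountCredentials;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageOptions;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

public class GoogleCloudStorageExample {
    static Storage storageFor(String filesRootDirectory, String projectName) throws IOException {
        String cloudKeyFile = filesRootDirectory + File.separator + "googlecloudkey.json";
        // The stream is closed automatically, even if credential parsing throws.
        try (FileInputStream cloudKeyStream = new FileInputStream(cloudKeyFile)) {
            return StorageOptions.newBuilder()
                    .setCredentials(ServiceAccountCredentials.fromStream(cloudKeyStream))
                    .setProjectId(projectName)
                    .build()
                    .getService();
        }
    }
}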