diff --git a/doc/release-notes/10579-avoid-solr-deletes.md b/doc/release-notes/10579-avoid-solr-deletes.md
new file mode 100644
index 00000000000..1062a2fb78f
--- /dev/null
+++ b/doc/release-notes/10579-avoid-solr-deletes.md
@@ -0,0 +1,9 @@
+A feature flag called "reduce-solr-deletes" has been added to improve how datafiles are indexed. When the flag is enabled,
+Dataverse will avoid preemptively deleting existing Solr documents for the files prior to sending updated information. This
+should improve performance and will allow additional optimizations going forward.
+
+The /api/admin/index/status and /api/admin/index/clear-orphans calls
+(see https://guides.dataverse.org/en/latest/admin/solr-search-index.html#index-and-database-consistency)
+will now find and remove (respectively) additional permissions-related Solr documents that were not being detected before.
+Reducing the overall number of documents will improve Solr performance, and large sites may wish to periodically call the
+clear-orphans API.
\ No newline at end of file
diff --git a/doc/sphinx-guides/source/developers/performance.rst b/doc/sphinx-guides/source/developers/performance.rst
index 562fa330d75..6c864bec257
--- a/doc/sphinx-guides/source/developers/performance.rst
+++ b/doc/sphinx-guides/source/developers/performance.rst
@@ -121,6 +121,7 @@ While in the past Solr performance hasn't been much of a concern, in recent year
 
 We are tracking performance problems in `#10469 <https://github.com/IQSS/dataverse/issues/10469>`_.
 
 In a meeting with a Solr expert on 2024-05-10 we were advised to avoid joins as much as possible. (It was acknowledged that many Solr users make use of joins because they have to, like we do, to keep some documents private.) Toward that end we have added two feature flags called ``avoid-expensive-solr-join`` and ``add-publicobject-solr-field`` as explained under :ref:`feature-flags`. It was confirmed experimentally that performing the join on all the public objects (published collections, datasets and files), i.e., the bulk of the content in the search index, was indeed very expensive, especially on a large instance the size of the IQSS prod. archive, especially under indexing load. We confirmed that it was in fact unnecessary and were able to replace it with a boolean field directly in the indexed documents, which is achieved by the two feature flags above. However, as of writing this, this mechanism should still be considered experimental.
+Another flag, ``reduce-solr-deletes``, avoids deleting Solr documents for a dataset's files prior to sending updates. It also eliminates several causes of orphaned permission documents. This is expected to improve indexing performance to some extent and is a step toward avoiding unnecessary updates altogether (i.e., skipping updates when a document would not change).
 
 Datasets with Large Numbers of Files or Versions
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
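For sites that want to schedule the periodic cleanup mentioned in the release note, a minimal sketch of the two admin calls follows. It assumes a stock installation where the admin API is reachable at localhost:8080 and that both endpoints accept plain GET requests, as the curl examples in the linked guide suggest; adjust the base URL and any unblock-key handling to your deployment.

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

// Hypothetical helper, not part of the patch: report index/database
// consistency, then remove the orphaned documents the report identifies.
public class SolrOrphanCleanup {

    private static final String BASE = "http://localhost:8080"; // assumption: local admin API

    public static void main(String[] args) throws Exception {
        HttpClient client = HttpClient.newHttpClient();
        // 1. Report content that is inconsistent between the database and Solr,
        //    including the permission documents found by the improved detection.
        System.out.println(get(client, BASE + "/api/admin/index/status"));
        // 2. Remove the orphaned Solr documents reported above.
        System.out.println(get(client, BASE + "/api/admin/index/clear-orphans"));
    }

    private static String get(HttpClient client, String url) throws Exception {
        HttpRequest request = HttpRequest.newBuilder(URI.create(url)).GET().build();
        return client.send(request, HttpResponse.BodyHandlers.ofString()).body();
    }
}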
diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst
index 213ac827819..e6f6a49a499
--- a/doc/sphinx-guides/source/installation/config.rst
+++ b/doc/sphinx-guides/source/installation/config.rst
@@ -3274,6 +3274,9 @@ please find all known feature flags below. Any of these flags can be activated u
     * - add-publicobject-solr-field
       - Adds an extra boolean field `PublicObject_b:true` for public content (published Collections, Datasets and Files). Once reindexed with these fields, we can rely on it to remove a very expensive Solr join on all such documents in Solr queries, significantly improving overall performance (by enabling the feature flag above, `avoid-expensive-solr-join`). These two flags are separate so that an instance can reindex their holdings before enabling the optimization in searches, thus avoiding having their public objects temporarily disappear from search results while the reindexing is in progress.
       - ``Off``
+    * - reduce-solr-deletes
+      - Avoids deleting and recreating Solr documents for dataset files when reindexing.
+      - ``Off``
 
 **Note:** Feature flags can be set via any `supported MicroProfile Config API source`_, e.g. the environment variable ``DATAVERSE_FEATURE_XXX`` (e.g. ``DATAVERSE_FEATURE_API_SESSION_AUTH=1``). These environment variables can be set in your shell before starting Payara. If you are using :doc:`Docker for development </container/dev-usage>`, you can set them in the `docker compose <https://github.com/IQSS/dataverse/blob/develop/docker-compose-dev.yml>`_ file.
diff --git a/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java
index 41ea6ae39f0..21f925f8981
--- a/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java
+++ b/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java
@@ -1,5 +1,6 @@
 package edu.harvard.iq.dataverse;
 
+import edu.harvard.iq.dataverse.DatasetVersion.VersionState;
 import edu.harvard.iq.dataverse.authorization.users.AuthenticatedUser;
 import edu.harvard.iq.dataverse.dataaccess.DataAccess;
 import edu.harvard.iq.dataverse.dataaccess.ImageThumbConverter;
@@ -383,7 +384,8 @@ public FileMetadata findMostRecentVersionFileIsIn(DataFile file) {
         if (fileMetadatas == null || fileMetadatas.isEmpty()) {
             return null;
         } else {
-            return fileMetadatas.get(0);
+            // This assumes fileMetadatas is ordered from the oldest version to the most recent, which is true as of v6.3
+            return fileMetadatas.get(fileMetadatas.size() - 1);
         }
     }
 
@@ -759,6 +761,13 @@ public List<DataFile> findAll() {
         return em.createQuery("select object(o) from DataFile as o order by o.id", DataFile.class).getResultList();
     }
 
+    public List<VersionState> findVersionStates(Long fileId) {
+        Query query = em.createQuery(
+                "select distinct dv.versionState from DatasetVersion dv where dv.id in (select fm.datasetVersion.id from FileMetadata fm where fm.dataFile.id=:fileId)");
+        query.setParameter("fileId", fileId);
+        return query.getResultList();
+    }
+
     public DataFile save(DataFile dataFile) {
 
         if (dataFile.isMergeable()) {
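The `findMostRecentVersionFileIsIn` change above leans entirely on list order. A self-contained sketch of that assumption, with plain strings standing in for the JPA-managed `FileMetadata` entities:

import java.util.List;

// Sketch: if a list is ordered oldest-first, the most recent element is the last one.
public class LastElementDemo {

    static <T> T mostRecent(List<T> orderedOldestFirst) {
        if (orderedOldestFirst == null || orderedOldestFirst.isEmpty()) {
            return null; // mirrors the null/empty guard in findMostRecentVersionFileIsIn
        }
        return orderedOldestFirst.get(orderedOldestFirst.size() - 1);
    }

    public static void main(String[] args) {
        // Stand-ins for FileMetadata rows attached to versions 1.0, 1.1, and 2.0:
        System.out.println(mostRecent(List.of("fm-v1.0", "fm-v1.1", "fm-v2.0"))); // fm-v2.0
    }
}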
diff --git a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java
index 0102459ab9f..63ce35114e2
--- a/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java
+++ b/src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java
@@ -1,6 +1,7 @@
 package edu.harvard.iq.dataverse.search;
 
 import edu.harvard.iq.dataverse.*;
+import edu.harvard.iq.dataverse.DatasetVersion.VersionState;
 import edu.harvard.iq.dataverse.authorization.AuthenticationServiceBean;
 import edu.harvard.iq.dataverse.authorization.providers.builtin.BuiltinUserServiceBean;
 import edu.harvard.iq.dataverse.batch.util.LoggingUtil;
@@ -12,6 +13,7 @@
 import edu.harvard.iq.dataverse.datavariable.VariableMetadataUtil;
 import edu.harvard.iq.dataverse.datavariable.VariableServiceBean;
 import edu.harvard.iq.dataverse.harvest.client.HarvestingClient;
+import edu.harvard.iq.dataverse.search.IndexableDataset.DatasetState;
 import edu.harvard.iq.dataverse.settings.FeatureFlags;
 import edu.harvard.iq.dataverse.settings.JvmSettings;
 import edu.harvard.iq.dataverse.settings.SettingsServiceBean;
@@ -38,6 +40,7 @@
 import java.util.concurrent.Future;
 import java.util.concurrent.Semaphore;
 import java.util.function.Function;
+import java.util.logging.Level;
 import java.util.logging.Logger;
 import java.util.stream.Collectors;
 import jakarta.annotation.PostConstruct;
@@ -474,94 +477,160 @@ private void doIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) thr
         /**
          * @todo should we use solrDocIdentifierDataset or
         * IndexableObject.IndexableTypes.DATASET.getName() + "_" ?
         */
-        // String solrIdPublished = solrDocIdentifierDataset + dataset.getId();
         String solrIdPublished = determinePublishedDatasetSolrDocId(dataset);
         String solrIdDraftDataset = IndexableObject.IndexableTypes.DATASET.getName() + "_" + dataset.getId() + IndexableDataset.DatasetState.WORKING_COPY.getSuffix();
-        // String solrIdDeaccessioned = IndexableObject.IndexableTypes.DATASET.getName()
-        // + "_" + dataset.getId() +
-        // IndexableDataset.DatasetState.DEACCESSIONED.getSuffix();
         String solrIdDeaccessioned = determineDeaccessionedDatasetId(dataset);
         StringBuilder debug = new StringBuilder();
         debug.append("\ndebug:\n");
-        int numPublishedVersions = 0;
-        List<DatasetVersion> versions = dataset.getVersions();
-        List<String> solrIdsOfFilesToDelete = new ArrayList<>();
-        for (DatasetVersion datasetVersion : versions) {
-            Long versionDatabaseId = datasetVersion.getId();
-            String versionTitle = datasetVersion.getTitle();
-            String semanticVersion = datasetVersion.getSemanticVersion();
-            DatasetVersion.VersionState versionState = datasetVersion.getVersionState();
-            if (versionState.equals(DatasetVersion.VersionState.RELEASED)) {
-                numPublishedVersions += 1;
-            }
-            debug.append("version found with database id " + versionDatabaseId + "\n");
-            debug.append("- title: " + versionTitle + "\n");
-            debug.append("- semanticVersion-VersionState: " + semanticVersion + "-" + versionState + "\n");
-            List<FileMetadata> fileMetadatas = datasetVersion.getFileMetadatas();
-            List<String> fileInfo = new ArrayList<>();
-            for (FileMetadata fileMetadata : fileMetadatas) {
-                String solrIdOfPublishedFile = solrDocIdentifierFile + fileMetadata.getDataFile().getId();
-                /**
-                 * It sounds weird but the first thing we'll do is preemptively
-                 * delete the Solr documents of all published files. Don't
-                 * worry, published files will be re-indexed later along with
-                 * the dataset. We do this so users can delete files from
-                 * published versions of datasets and then re-publish a new
-                 * version without fear that their old published files (now
-                 * deleted from the latest published version) will be
-                 * searchable. See also
-                 * https://github.com/IQSS/dataverse/issues/762
-                 */
-                solrIdsOfFilesToDelete.add(solrIdOfPublishedFile);
-                fileInfo.add(fileMetadata.getDataFile().getId() + ":" + fileMetadata.getLabel());
-            }
-            try {
-                /**
-                 * Preemptively delete *all* Solr documents for files associated
-                 * with the dataset based on a Solr query.
-                 *
-                 * We must query Solr for this information because the file has
-                 * been deleted from the database ( perhaps when Solr was down,
-                 * as reported in https://github.com/IQSS/dataverse/issues/2086
-                 * ) so the database doesn't even know about the file. It's an
-                 * orphan.
-                 *
-                 * @todo This Solr query should make the iteration above based
-                 * on the database unnecessary because it the Solr query should
-                 * find all files for the dataset. We can probably remove the
-                 * iteration above after an "index all" has been performed.
-                 * Without an "index all" we won't be able to find files based
-                 * on parentId because that field wasn't searchable in 4.0.
-                 *
-                 * @todo We should also delete the corresponding Solr
-                 * "permission" documents for the files.
-                 */
-                List<String> allFilesForDataset = findFilesOfParentDataset(dataset.getId());
-                solrIdsOfFilesToDelete.addAll(allFilesForDataset);
-            } catch (SearchException | NullPointerException ex) {
-                logger.fine("could not run search of files to delete: " + ex);
-            }
-            int numFiles = 0;
-            if (fileMetadatas != null) {
-                numFiles = fileMetadatas.size();
-            }
-            debug.append("- files: " + numFiles + " " + fileInfo.toString() + "\n");
-        }
-        debug.append("numPublishedVersions: " + numPublishedVersions + "\n");
-        if (doNormalSolrDocCleanUp) {
-            IndexResponse resultOfAttemptToPremptivelyDeletePublishedFiles = solrIndexService.deleteMultipleSolrIds(solrIdsOfFilesToDelete);
-            debug.append("result of attempt to premptively deleted published files before reindexing: " + resultOfAttemptToPremptivelyDeletePublishedFiles + "\n");
-        }
+        boolean reduceSolrDeletes = FeatureFlags.REDUCE_SOLR_DELETES.enabled();
+        if (!reduceSolrDeletes) {
+            int numPublishedVersions = 0;
+            List<DatasetVersion> versions = dataset.getVersions();
+            List<String> solrIdsOfFilesToDelete = new ArrayList<>();
+            for (DatasetVersion datasetVersion : versions) {
+                Long versionDatabaseId = datasetVersion.getId();
+                String versionTitle = datasetVersion.getTitle();
+                String semanticVersion = datasetVersion.getSemanticVersion();
+                DatasetVersion.VersionState versionState = datasetVersion.getVersionState();
+                if (versionState.equals(DatasetVersion.VersionState.RELEASED)) {
+                    numPublishedVersions += 1;
+                }
+                debug.append("version found with database id " + versionDatabaseId + "\n");
+                debug.append("- title: " + versionTitle + "\n");
+                debug.append("- semanticVersion-VersionState: " + semanticVersion + "-" + versionState + "\n");
+                List<FileMetadata> fileMetadatas = datasetVersion.getFileMetadatas();
+                List<String> fileInfo = new ArrayList<>();
+                for (FileMetadata fileMetadata : fileMetadatas) {
+                    String solrIdOfPublishedFile = solrDocIdentifierFile + fileMetadata.getDataFile().getId();
+                    /**
+                     * It sounds weird but the first thing we'll do is preemptively
+                     * delete the Solr documents of all published files. Don't
+                     * worry, published files will be re-indexed later along with
+                     * the dataset. We do this so users can delete files from
+                     * published versions of datasets and then re-publish a new
+                     * version without fear that their old published files (now
+                     * deleted from the latest published version) will be
+                     * searchable. See also
+                     * https://github.com/IQSS/dataverse/issues/762
+                     */
+                    solrIdsOfFilesToDelete.add(solrIdOfPublishedFile);
+                    fileInfo.add(fileMetadata.getDataFile().getId() + ":" + fileMetadata.getLabel());
+                }
+                try {
+                    /**
+                     * Preemptively delete *all* Solr documents for files associated
+                     * with the dataset based on a Solr query.
+                     *
+                     * We must query Solr for this information because the file has
+                     * been deleted from the database ( perhaps when Solr was down,
+                     * as reported in https://github.com/IQSS/dataverse/issues/2086
+                     * ) so the database doesn't even know about the file. It's an
+                     * orphan.
+                     *
+                     * @todo This Solr query should make the iteration above based
+                     * on the database unnecessary because the Solr query should
+                     * find all files for the dataset. We can probably remove the
+                     * iteration above after an "index all" has been performed.
+                     * Without an "index all" we won't be able to find files based
+                     * on parentId because that field wasn't searchable in 4.0.
+                     *
+                     * @todo We should also delete the corresponding Solr
+                     * "permission" documents for the files.
+                     */
+                    List<String> allFilesForDataset = findFilesOfParentDataset(dataset.getId());
+                    solrIdsOfFilesToDelete.addAll(allFilesForDataset);
+                } catch (SearchException | NullPointerException ex) {
+                    logger.fine("could not run search of files to delete: " + ex);
+                }
+                int numFiles = 0;
+                if (fileMetadatas != null) {
+                    numFiles = fileMetadatas.size();
+                }
+                debug.append("- files: " + numFiles + " " + fileInfo.toString() + "\n");
+            }
+            debug.append("numPublishedVersions: " + numPublishedVersions + "\n");
+            if (doNormalSolrDocCleanUp) {
+                IndexResponse resultOfAttemptToPremptivelyDeletePublishedFiles = solrIndexService.deleteMultipleSolrIds(solrIdsOfFilesToDelete);
+                debug.append("result of attempt to premptively deleted published files before reindexing: " + resultOfAttemptToPremptivelyDeletePublishedFiles + "\n");
+            }
+        }
 
         DatasetVersion latestVersion = dataset.getLatestVersion();
-        String latestVersionStateString = latestVersion.getVersionState().name();
         DatasetVersion.VersionState latestVersionState = latestVersion.getVersionState();
+        String latestVersionStateString = latestVersionState.name();
         DatasetVersion releasedVersion = dataset.getReleasedVersion();
         boolean atLeastOnePublishedVersion = false;
         if (releasedVersion != null) {
             atLeastOnePublishedVersion = true;
-        } else {
-            atLeastOnePublishedVersion = false;
         }
+        if (reduceSolrDeletes) {
+            // Initialized empty (rather than null) so the logging and cleanup below
+            // stay safe even if the Solr query in the try block fails.
+            List<String> solrIdsOfDocsToDelete = new ArrayList<>();
+            if (logger.isLoggable(Level.FINE)) {
+                writeDebugInfo(debug, dataset);
+            }
+            if (doNormalSolrDocCleanUp) {
+                try {
+                    solrIdsOfDocsToDelete = findFilesOfParentDataset(dataset.getId());
+                    logger.fine("Existing file docs: " + String.join(", ", solrIdsOfDocsToDelete));
+                    if (!solrIdsOfDocsToDelete.isEmpty()) {
+                        // We keep the latest version's docs unless it is deaccessioned and there is no
+                        // published/released version.
+                        // So skip the loop removing those docs from the delete list except in that case.
+                        if ((!latestVersion.isDeaccessioned() || atLeastOnePublishedVersion)) {
+                            List<FileMetadata> latestFileMetadatas = latestVersion.getFileMetadatas();
+                            String suffix = (new IndexableDataset(latestVersion)).getDatasetState().getSuffix();
+                            for (FileMetadata fileMetadata : latestFileMetadatas) {
+                                String solrIdOfPublishedFile = solrDocIdentifierFile
+                                        + fileMetadata.getDataFile().getId() + suffix;
+                                solrIdsOfDocsToDelete.remove(solrIdOfPublishedFile);
+                            }
+                        }
+                        if (releasedVersion != null && !releasedVersion.equals(latestVersion)) {
+                            List<FileMetadata> releasedFileMetadatas = releasedVersion.getFileMetadatas();
+                            for (FileMetadata fileMetadata : releasedFileMetadatas) {
+                                String solrIdOfPublishedFile = solrDocIdentifierFile
+                                        + fileMetadata.getDataFile().getId();
+                                solrIdsOfDocsToDelete.remove(solrIdOfPublishedFile);
+                            }
+                        }
+                    }
+                    // Clear any unused dataset docs
+                    if (!latestVersion.isDraft()) {
+                        // The latest version is released, so should delete any draft docs for the
+                        // dataset
+                        solrIdsOfDocsToDelete.add(solrIdDraftDataset);
+                    }
+                    if (!atLeastOnePublishedVersion) {
+                        // There's no released version, so should
+                        // delete any normal state docs for the dataset
+                        solrIdsOfDocsToDelete.add(solrIdPublished);
+                    }
+                    if (atLeastOnePublishedVersion || !latestVersion.isDeaccessioned()) {
+                        // There's a released version or a draft, so should delete any deaccessioned
+                        // state docs for the dataset
+                        solrIdsOfDocsToDelete.add(solrIdDeaccessioned);
+                    }
+                } catch (SearchException | NullPointerException ex) {
+                    logger.fine("could not run search of files to delete: " + ex);
+                }
+                logger.fine("Solr docs to delete: " + String.join(", ", solrIdsOfDocsToDelete));
+
+                if (!solrIdsOfDocsToDelete.isEmpty()) {
+                    List<String> solrIdsOfPermissionDocsToDelete = new ArrayList<>();
+                    for (String file : solrIdsOfDocsToDelete) {
+                        // Also remove associated permission docs
+                        solrIdsOfPermissionDocsToDelete.add(file + discoverabilityPermissionSuffix);
+                    }
+                    solrIdsOfDocsToDelete.addAll(solrIdsOfPermissionDocsToDelete);
+                    logger.fine("Solr docs and perm docs to delete: " + String.join(", ", solrIdsOfDocsToDelete));
+
+                    IndexResponse resultOfAttemptToPremptivelyDeletePublishedFiles = solrIndexService
+                            .deleteMultipleSolrIds(solrIdsOfDocsToDelete);
+                    debug.append("result of attempt to premptively deleted published files before reindexing: "
+                            + resultOfAttemptToPremptivelyDeletePublishedFiles + "\n");
+                }
+            }
+        }
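// Read as set bookkeeping, the block above does three things: start from the
// file doc ids Solr currently holds, keep the ones the latest and released
// versions still use, and pair everything left with its permission companion.
// A minimal standalone sketch of that bookkeeping follows; PERMISSION_SUFFIX is
// an illustrative stand-in for the real discoverabilityPermissionSuffix, and
// the live Solr/JPA lookups are replaced by plain lists.

import java.util.ArrayList;
import java.util.List;

public class DeleteListSketch {

    static final String PERMISSION_SUFFIX = "_permission"; // illustrative stand-in

    // existingDocIds: file doc ids currently in Solr for the dataset
    // stillNeededDocIds: doc ids for files in the latest/released versions
    static List<String> docsToDelete(List<String> existingDocIds, List<String> stillNeededDocIds) {
        List<String> toDelete = new ArrayList<>(existingDocIds);
        toDelete.removeAll(stillNeededDocIds); // keep docs the current versions still use
        List<String> permDocs = new ArrayList<>();
        for (String docId : toDelete) {
            permDocs.add(docId + PERMISSION_SUFFIX); // drop the matching permission doc too
        }
        toDelete.addAll(permDocs);
        return toDelete;
    }

    public static void main(String[] args) {
        System.out.println(docsToDelete(
                List.of("datafile_1", "datafile_2", "datafile_3_draft"),
                List.of("datafile_1")));
        // -> [datafile_2, datafile_3_draft, datafile_2_permission, datafile_3_draft_permission]
    }
}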
 
         Map<DatasetVersion.VersionState, Boolean> desiredCards = new LinkedHashMap<>();
         /**
          * @todo refactor all of this below and have a single method that takes
@@ -584,7 +653,7 @@ private void doIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) thr
                     .append(indexDraftResult).append("\n");
 
                 desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false);
-                if (doNormalSolrDocCleanUp) {
+                if (!reduceSolrDeletes && doNormalSolrDocCleanUp) {
                     String deleteDeaccessionedResult = removeDeaccessioned(dataset);
                     results.append("Draft exists, no need for deaccessioned version. Deletion attempted for ")
                             .append(solrIdDeaccessioned).append(" (and files). Result: ")
                             .append(deleteDeaccessionedResult).append("\n");
                 }
@@ -592,7 +661,7 @@ private void doIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) thr
 
                 desiredCards.put(DatasetVersion.VersionState.RELEASED, false);
-                if (doNormalSolrDocCleanUp) {
+                if (!reduceSolrDeletes && doNormalSolrDocCleanUp) {
                     String deletePublishedResults = removePublished(dataset);
                     results.append("No published version. Attempting to delete traces of published version from index. Result: ")
                             .append(deletePublishedResults).append("\n");
                 }
@@ -635,13 +704,13 @@ private void doIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) thr
                 results.append("No draft version. Attempting to index as deaccessioned. Result: ").append(indexDeaccessionedVersionResult).append("\n");
 
                 desiredCards.put(DatasetVersion.VersionState.RELEASED, false);
-                if (doNormalSolrDocCleanUp) {
+                if (!reduceSolrDeletes && doNormalSolrDocCleanUp) {
                     String deletePublishedResults = removePublished(dataset);
                     results.append("No published version. Attempting to delete traces of published version from index. Result: ").append(deletePublishedResults).append("\n");
                 }
 
                 desiredCards.put(DatasetVersion.VersionState.DRAFT, false);
-                if (doNormalSolrDocCleanUp) {
+                if (!reduceSolrDeletes && doNormalSolrDocCleanUp) {
                     List<String> solrDocIdsForDraftFilesToDelete = findSolrDocIdsForDraftFilesToDelete(dataset);
                     String deleteDraftDatasetVersionResult = removeSolrDocFromIndex(solrIdDraftDataset);
                     String deleteDraftFilesResults = deleteDraftFiles(solrDocIdsForDraftFilesToDelete);
@@ -689,7 +758,7 @@ private void doIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) thr
                 results.append("Attempted to index " + solrIdPublished).append(". Result: ").append(indexReleasedVersionResult).append("\n");
 
                 desiredCards.put(DatasetVersion.VersionState.DRAFT, false);
-                if (doNormalSolrDocCleanUp) {
+                if (!reduceSolrDeletes && doNormalSolrDocCleanUp) {
                     List<String> solrDocIdsForDraftFilesToDelete = findSolrDocIdsForDraftFilesToDelete(dataset);
                     String deleteDraftDatasetVersionResult = removeSolrDocFromIndex(solrIdDraftDataset);
                     String deleteDraftFilesResults = deleteDraftFiles(solrDocIdsForDraftFilesToDelete);
@@ -698,7 +767,7 @@ private void doIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) thr
                 }
 
                 desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false);
-                if (doNormalSolrDocCleanUp) {
+                if (!reduceSolrDeletes && doNormalSolrDocCleanUp) {
                     String deleteDeaccessionedResult = removeDeaccessioned(dataset);
                     results.append("No need for deaccessioned version. Deletion attempted for ")
                             .append(solrIdDeaccessioned).append(". Result: ").append(deleteDeaccessionedResult);
@@ -749,7 +818,7 @@ private void doIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) thr
                     .append(solrIdDraftDataset).append(" (limited visibility). Result: ").append(indexDraftResult).append("\n");
 
                 desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false);
-                if (doNormalSolrDocCleanUp) {
+                if (!reduceSolrDeletes && doNormalSolrDocCleanUp) {
                     String deleteDeaccessionedResult = removeDeaccessioned(dataset);
                     results.append("No need for deaccessioned version. Deletion attempted for ")
                             .append(solrIdDeaccessioned).append(". Result: ").append(deleteDeaccessionedResult);
@@ -791,11 +860,42 @@ private void doIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) thr
         }
     }
 
-    private String deleteDraftFiles(List<String> solrDocIdsForDraftFilesToDelete) {
-        String deleteDraftFilesResults = "";
-        IndexResponse indexResponse = solrIndexService.deleteMultipleSolrIds(solrDocIdsForDraftFilesToDelete);
-        deleteDraftFilesResults = indexResponse.toString();
-        return deleteDraftFilesResults;
+    private void writeDebugInfo(StringBuilder debug, Dataset dataset) {
+        List<DatasetVersion> versions = dataset.getVersions();
+        int numPublishedVersions = 0;
+        for (DatasetVersion datasetVersion : versions) {
+            Long versionDatabaseId = datasetVersion.getId();
+            String versionTitle = datasetVersion.getTitle();
+            String semanticVersion = datasetVersion.getSemanticVersion();
+            DatasetVersion.VersionState versionState = datasetVersion.getVersionState();
+            if (versionState.equals(DatasetVersion.VersionState.RELEASED)) {
+                numPublishedVersions += 1;
+            }
+            debug.append("version found with database id " + versionDatabaseId + "\n");
+            debug.append("- title: " + versionTitle + "\n");
+            debug.append("- semanticVersion-VersionState: " + semanticVersion + "-" + versionState + "\n");
+            List<String> fileInfo = new ArrayList<>();
+            List<FileMetadata> fileMetadatas = datasetVersion.getFileMetadatas();
+
+            for (FileMetadata fileMetadata : fileMetadatas) {
+                /**
+                 * Unlike the pre-flag code this loop was copied from, this
+                 * method only gathers per-file debug information; nothing is
+                 * deleted here. (For the history of the preemptive deletes,
+                 * see https://github.com/IQSS/dataverse/issues/762.)
+                 */
+                fileInfo.add(fileMetadata.getDataFile().getId() + ":" + fileMetadata.getLabel());
+            }
+            int numFiles = 0;
+            if (fileMetadatas != null) {
+                numFiles = fileMetadatas.size();
+            }
+            debug.append("- files: " + numFiles + " " + fileInfo.toString() + "\n");
+        }
+        debug.append("numPublishedVersions: " + numPublishedVersions + "\n");
     }
 
     private IndexResponse indexDatasetPermissions(Dataset dataset) {
@@ -873,14 +973,14 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set<Long>
     }
 
+    private String deleteDraftFiles(List<String> solrDocIdsForDraftFilesToDelete) {
+        String deleteDraftFilesResults = "";
+        IndexResponse indexResponse = solrIndexService.deleteMultipleSolrIds(solrDocIdsForDraftFilesToDelete);
+        deleteDraftFilesResults = indexResponse.toString();
+        return deleteDraftFilesResults;
+    }
 
     private Dataverse findRootDataverseCached() {
         if (true) {
@@ -2086,8 +2194,50 @@ public List<String> findPermissionsInSolrOnly() throws SearchException {
             SolrDocumentList list = rsp.getResults();
             for (SolrDocument doc : list) {
                 long id = Long.parseLong((String) doc.getFieldValue(SearchFields.DEFINITION_POINT_DVOBJECT_ID));
+                String docId = (String) doc.getFieldValue(SearchFields.ID);
                 if (!dvObjectService.checkExists(id)) {
-                    permissionInSolrOnly.add((String) doc.getFieldValue(SearchFields.ID));
+                    permissionInSolrOnly.add(docId);
+                } else {
+                    DvObject obj = dvObjectService.findDvObject(id);
+                    if (obj instanceof Dataset d) {
+                        DatasetVersion dv = d.getLatestVersion();
+                        if (docId.endsWith("draft_permission")) {
+                            if (!dv.isDraft()) {
+                                permissionInSolrOnly.add(docId);
+                            }
+                        } else if (docId.endsWith("deaccessioned_permission")) {
+                            if (!dv.isDeaccessioned()) {
+                                permissionInSolrOnly.add(docId);
+                            }
+                        } else {
+                            if (d.getReleasedVersion() == null) {
+                                permissionInSolrOnly.add(docId);
+                            }
+                        }
+                    } else if (obj instanceof DataFile f) {
+                        List<VersionState> states = dataFileService.findVersionStates(f.getId());
+                        Set<String> strings = states.stream().map(VersionState::toString).collect(Collectors.toSet());
+                        logger.fine("States for " + docId + ": " + String.join(", ", strings));
+                        if (docId.endsWith("draft_permission")) {
+                            if (!states.contains(VersionState.DRAFT)) {
+                                permissionInSolrOnly.add(docId);
+                            }
+                        } else if (docId.endsWith("deaccessioned_permission")) {
+                            if (!states.contains(VersionState.DEACCESSIONED) && states.size() == 1) {
+                                permissionInSolrOnly.add(docId);
+                            }
+                        } else {
+                            if (!states.contains(VersionState.RELEASED)) {
+                                permissionInSolrOnly.add(docId);
+                            } else {
+                                if (dataFileService.findFileMetadataByDatasetVersionIdAndDataFileId(f.getOwner().getReleasedVersion().getId(), f.getId()) == null) {
+                                    logger.fine("Adding doc " + docId + " to list of permissions in Solr only");
+                                    permissionInSolrOnly.add(docId);
+                                }
+                            }
+                        }
+                    }
+                }
             }
             if (cursorMark.equals(nextCursorMark)) {
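The per-file branch above reduces to a small decision over the set of version states that still contain the file. A standalone sketch of that decision, under the stated assumptions that the nested enum mirrors DatasetVersion.VersionState and that the extra released-version membership check (findFileMetadataByDatasetVersionIdAndDataFileId) is elided:

import java.util.Set;

public class OrphanPermissionCheck {

    enum VersionState { DRAFT, RELEASED, DEACCESSIONED } // mirrors DatasetVersion.VersionState

    // states: the distinct states of every dataset version containing the file
    static boolean isOrphan(String docId, Set<VersionState> states) {
        if (docId.endsWith("draft_permission")) {
            return !states.contains(VersionState.DRAFT); // no draft version holds the file
        }
        if (docId.endsWith("deaccessioned_permission")) {
            // orphaned only when a single, non-deaccessioned state remains (as in the patch)
            return !states.contains(VersionState.DEACCESSIONED) && states.size() == 1;
        }
        return !states.contains(VersionState.RELEASED); // published doc needs a released version
    }

    public static void main(String[] args) {
        System.out.println(isOrphan("datafile_42_draft_permission", Set.of(VersionState.RELEASED))); // true
        System.out.println(isOrphan("datafile_42_permission", Set.of(VersionState.RELEASED)));       // false
    }
}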
diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/FeatureFlags.java b/src/main/java/edu/harvard/iq/dataverse/settings/FeatureFlags.java
index 14a7ab86f22..d523bf92e63
--- a/src/main/java/edu/harvard/iq/dataverse/settings/FeatureFlags.java
+++ b/src/main/java/edu/harvard/iq/dataverse/settings/FeatureFlags.java
@@ -58,6 +58,19 @@ public enum FeatureFlags {
      * @since Dataverse 6.3
      */
     ADD_PUBLICOBJECT_SOLR_FIELD("add-publicobject-solr-field"),
+    /**
+     * Dataverse normally deletes all Solr documents related to a dataset's files
+     * when the dataset is reindexed. With this flag enabled, additional logic is
+     * added to the reindex process to delete only the Solr documents that are no
+     * longer needed. (Docs that are still needed will be updated rather than
+     * deleted and replaced.) Enabling this feature flag should make the reindex
+     * process faster without impacting search results.
+     *
+     * @apiNote Raise flag by setting
+     *          "dataverse.feature.reduce-solr-deletes"
+     * @since Dataverse 6.3
+     */
+    REDUCE_SOLR_DELETES("reduce-solr-deletes"),
     ;
 
     final String flag;
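As the @apiNote above says, the flag is raised through MicroProfile Config. A minimal sketch of how such a lookup resolves, assuming only the standard MicroProfile Config API on the classpath; the real FeatureFlags.enabled() encapsulates the equivalent call:

import org.eclipse.microprofile.config.ConfigProvider;

public class FlagLookupSketch {
    public static void main(String[] args) {
        // Resolves from any supported config source, e.g. the environment variable
        // DATAVERSE_FEATURE_REDUCE_SOLR_DELETES=1 set before starting Payara.
        boolean enabled = ConfigProvider.getConfig()
                .getOptionalValue("dataverse.feature.reduce-solr-deletes", Boolean.class)
                .orElse(false); // feature flags default to Off
        System.out.println("reduce-solr-deletes enabled: " + enabled);
    }
}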