From 5c3225692dcea1eddbb3e76ae19f47de5ea23a96 Mon Sep 17 00:00:00 2001 From: Chaitanya Gohel <104654647+gashutos@users.noreply.github.com> Date: Mon, 12 Jun 2023 22:29:32 +0530 Subject: [PATCH] Adding back [Time series based workload desc order optimization through reverse segment read (#7244)] with fixes (#7967) * Revert "Revert "Time series based workload desc order optimization through reverse segment read (#7244)" (#7892)" This reverts commit bb265369d67431a7aa15efca8734326857db7e32. Signed-off-by: gashutos * Enable time series optimization only if it is not IndexSorted index, also ASC order reverse should only consider in @timestamp field Signed-off-by: gashutos * Modifying CHANGELOG Signed-off-by: gashutos * Adding integ test for scroll API where sort by _doc is getting early termination Signed-off-by: gashutos --------- Signed-off-by: gashutos --- CHANGELOG.md | 2 + .../test/scroll/10_basic_timeseries.yml | 161 ++++++++++++++++++ .../cluster/metadata/DataStream.java | 23 +++ .../org/opensearch/index/IndexSettings.java | 12 ++ .../opensearch/index/engine/EngineConfig.java | 19 +++ .../index/engine/EngineConfigFactory.java | 6 +- .../index/engine/InternalEngine.java | 3 + .../index/mapper/MappingLookup.java | 10 ++ .../opensearch/index/shard/IndexShard.java | 17 +- .../search/internal/ContextIndexSearcher.java | 34 +++- .../engine/EngineConfigFactoryTests.java | 6 +- .../test/OpenSearchIntegTestCase.java | 1 + 12 files changed, 288 insertions(+), 6 deletions(-) create mode 100644 rest-api-spec/src/main/resources/rest-api-spec/test/scroll/10_basic_timeseries.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index e3418e4112f63..379966b44e9a5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -95,6 +95,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Add task cancellation monitoring service ([#7642](https://github.com/opensearch-project/OpenSearch/pull/7642)) - Add TokenManager Interface ([#7452](https://github.com/opensearch-project/OpenSearch/pull/7452)) - Add Remote store as a segment replication source ([#7653](https://github.com/opensearch-project/OpenSearch/pull/7653)) +- Add descending order search optimization through reverse segment read. ([#7967](https://github.com/opensearch-project/OpenSearch/pull/7967)) + ### Dependencies - Bump `jackson` from 2.15.1 to 2.15.2 ([#7897](https://github.com/opensearch-project/OpenSearch/pull/7897)) diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/scroll/10_basic_timeseries.yml b/rest-api-spec/src/main/resources/rest-api-spec/test/scroll/10_basic_timeseries.yml new file mode 100644 index 0000000000000..1995bee89a0a4 --- /dev/null +++ b/rest-api-spec/src/main/resources/rest-api-spec/test/scroll/10_basic_timeseries.yml @@ -0,0 +1,161 @@ +--- +"Basic scroll on time series workload for reversed leaf sorter": + - do: + indices.create: + index: test_scroll_time_series + body: + mappings: + properties: + name: + type: keyword + '@timestamp': + type: date + + - do: + bulk: + refresh: true + index: test_scroll_time_series + body: + - '{"index": {}}' + - '{"name": "1", "@timestamp": "2010-03-12T01:07:00"}' + - '{"index": {}}' + - '{"name": "2", "@timestamp": "2010-03-12T01:07:01"}' + - '{"index": {}}' + - '{"name": "3", "@timestamp": "2010-03-12T01:07:02"}' + - '{"index": {}}' + - '{"name": "4", "@timestamp": "2010-03-12T01:07:03"}' + - '{"index": {}}' + - '{"name": "5", "@timestamp": "2010-03-12T01:07:04"}' + - '{"index": {}}' + - '{"name": "6", "@timestamp": "2010-03-12T01:07:05"}' + - '{"index": {}}' + - '{"name": "7", "@timestamp": "2010-03-12T01:07:06"}' + - '{"index": {}}' + - '{"name": "8", "@timestamp": "2010-03-12T01:07:07"}' + - '{"index": {}}' + - '{"name": "9", "@timestamp": "2010-03-12T01:07:08"}' + - '{"index": {}}' + - '{"name": "10", "@timestamp": "2010-03-12T01:07:09"}' + - do: + indices.refresh: {} + - do: + bulk: + refresh: true + index: test_scroll_time_series + body: + - '{"index": {}}' + - '{"name": "11", "@timestamp": "2010-03-12T01:07:10"}' + - '{"index": {}}' + - '{"name": "12", "@timestamp": "2010-03-12T01:07:11"}' + - '{"index": {}}' + - '{"name": "13", "@timestamp": "2010-03-12T01:07:12"}' + - '{"index": {}}' + - '{"name": "14", "@timestamp": "2010-03-12T01:07:13"}' + - '{"index": {}}' + - '{"name": "15", "@timestamp": "2010-03-12T01:07:14"}' + - '{"index": {}}' + - '{"name": "16", "@timestamp": "2010-03-12T01:07:15"}' + - '{"index": {}}' + - '{"name": "17", "@timestamp": "2010-03-12T01:07:16"}' + - '{"index": {}}' + - '{"name": "18", "@timestamp": "2010-03-12T01:07:17"}' + - '{"index": {}}' + - '{"name": "19", "@timestamp": "2010-03-12T01:07:18"}' + - '{"index": {}}' + - '{"name": "20", "@timestamp": "2010-03-12T01:07:19"}' + - do: + indices.refresh: { } + - do: + bulk: + refresh: true + index: test_scroll_time_series + body: + - '{"index": {}}' + - '{"name": "21", "@timestamp": "2010-03-12T01:07:20"}' + - '{"index": {}}' + - '{"name": "22", "@timestamp": "2010-03-12T01:07:21"}' + - '{"index": {}}' + - '{"name": "23", "@timestamp": "2010-03-12T01:07:22"}' + - '{"index": {}}' + - '{"name": "24", "@timestamp": "2010-03-12T01:07:23"}' + - '{"index": {}}' + - '{"name": "25", "@timestamp": "2010-03-12T01:07:24"}' + - '{"index": {}}' + - '{"name": "26", "@timestamp": "2010-03-12T01:07:25"}' + - '{"index": {}}' + - '{"name": "27", "@timestamp": "2010-03-12T01:07:26"}' + - '{"index": {}}' + - '{"name": "28", "@timestamp": "2010-03-12T01:07:27"}' + - '{"index": {}}' + - '{"name": "29", "@timestamp": "2010-03-12T01:07:28"}' + - '{"index": {}}' + - '{"name": "30", "@timestamp": "2010-03-12T01:07:29"}' + - do: + indices.refresh: { } + + - do: + search: + rest_total_hits_as_int: true + index: test_scroll_time_series + size: 5 + scroll: 1m + sort: _doc + body: + query: + match_all: {} + + - set: {_scroll_id: scroll_id} + - match: {hits.total: 30 } + - length: {hits.hits: 5 } + + - do: + scroll: + rest_total_hits_as_int: true + body: { "scroll_id": "$scroll_id", "scroll": "1m"} + + - match: {hits.total: 30 } + - length: {hits.hits: 5 } + + - do: + scroll: + rest_total_hits_as_int: true + body: { "scroll_id": "$scroll_id", "scroll": "1m" } + + - match: { hits.total: 30 } + - length: { hits.hits: 5 } + + - do: + scroll: + rest_total_hits_as_int: true + body: { "scroll_id": "$scroll_id", "scroll": "1m" } + + - match: { hits.total: 30 } + - length: { hits.hits: 5 } + + - do: + scroll: + rest_total_hits_as_int: true + body: { "scroll_id": "$scroll_id", "scroll": "1m" } + + - match: { hits.total: 30 } + - length: { hits.hits: 5 } + + - do: + scroll: + rest_total_hits_as_int: true + body: { "scroll_id": "$scroll_id", "scroll": "1m" } + + - match: { hits.total: 30 } + - length: { hits.hits: 5 } + + - do: + scroll: + rest_total_hits_as_int: true + body: { "scroll_id": "$scroll_id", "scroll": "1m" } + + - match: { hits.total: 30 } + - length: { hits.hits: 0 } + + - do: + clear_scroll: + scroll_id: $scroll_id diff --git a/server/src/main/java/org/opensearch/cluster/metadata/DataStream.java b/server/src/main/java/org/opensearch/cluster/metadata/DataStream.java index 825aaee1ad1f8..f4be1cfff489c 100644 --- a/server/src/main/java/org/opensearch/cluster/metadata/DataStream.java +++ b/server/src/main/java/org/opensearch/cluster/metadata/DataStream.java @@ -31,6 +31,10 @@ package org.opensearch.cluster.metadata; +import org.apache.lucene.document.LongPoint; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.PointValues; +import org.opensearch.OpenSearchException; import org.opensearch.cluster.AbstractDiffable; import org.opensearch.cluster.Diff; import org.opensearch.core.ParseField; @@ -46,6 +50,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collections; +import java.util.Comparator; import java.util.List; import java.util.Locale; import java.util.Map; @@ -59,6 +64,24 @@ public final class DataStream extends AbstractDiffable implements ToXContentObject { public static final String BACKING_INDEX_PREFIX = ".ds-"; + public static final String TIMESERIES_FIELDNAME = "@timestamp"; + public static final Comparator TIMESERIES_LEAF_SORTER = Comparator.comparingLong((LeafReader r) -> { + try { + PointValues points = r.getPointValues(TIMESERIES_FIELDNAME); + if (points != null) { + // could be a multipoint (probably not) but get the maximum time value anyway + byte[] sortValue = points.getMaxPackedValue(); + // decode the first dimension because this should not be a multi dimension field + // it's a bug in the date field if it is + return LongPoint.decodeDimension(sortValue, 0); + } else { + // segment does not have a timestamp field, just return the minimum value + return Long.MIN_VALUE; + } + } catch (IOException e) { + throw new OpenSearchException("Not a timeseries Index! Field [{}] not found!", TIMESERIES_FIELDNAME); + } + }).reversed(); private final String name; private final TimestampField timeStampField; diff --git a/server/src/main/java/org/opensearch/index/IndexSettings.java b/server/src/main/java/org/opensearch/index/IndexSettings.java index e125facb76059..de7dc102939ce 100644 --- a/server/src/main/java/org/opensearch/index/IndexSettings.java +++ b/server/src/main/java/org/opensearch/index/IndexSettings.java @@ -665,6 +665,7 @@ private void setRetentionLeaseMillis(final TimeValue retentionLease) { private volatile long mappingTotalFieldsLimit; private volatile long mappingDepthLimit; private volatile long mappingFieldNameLengthLimit; + private volatile boolean searchSegmentOrderReversed; /** * The maximum number of refresh listeners allows on this shard. @@ -905,6 +906,10 @@ public IndexSettings(final IndexMetadata indexMetadata, final Settings nodeSetti scopedSettings.addSettingsUpdateConsumer(DEFAULT_SEARCH_PIPELINE, this::setDefaultSearchPipeline); } + private void setSearchSegmentOrderReversed(boolean reversed) { + this.searchSegmentOrderReversed = reversed; + } + private void setSearchIdleAfter(TimeValue searchIdleAfter) { this.searchIdleAfter = searchIdleAfter; } @@ -1084,6 +1089,13 @@ public Settings getNodeSettings() { return nodeSettings; } + /** + * Returns true if index level setting for leaf reverse order search optimization is enabled + */ + public boolean getSearchSegmentOrderReversed() { + return this.searchSegmentOrderReversed; + } + /** * Updates the settings and index metadata and notifies all registered settings consumers with the new settings iff at least one * setting has changed. diff --git a/server/src/main/java/org/opensearch/index/engine/EngineConfig.java b/server/src/main/java/org/opensearch/index/engine/EngineConfig.java index fe003405fd3f8..338a541af387a 100644 --- a/server/src/main/java/org/opensearch/index/engine/EngineConfig.java +++ b/server/src/main/java/org/opensearch/index/engine/EngineConfig.java @@ -33,6 +33,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.codecs.Codec; +import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.MergePolicy; import org.apache.lucene.search.QueryCache; import org.apache.lucene.search.QueryCachingPolicy; @@ -59,6 +60,7 @@ import org.opensearch.indices.breaker.CircuitBreakerService; import org.opensearch.threadpool.ThreadPool; +import java.util.Comparator; import java.util.List; import java.util.Objects; import java.util.function.BooleanSupplier; @@ -102,6 +104,7 @@ public final class EngineConfig { private final Supplier retentionLeasesSupplier; private final boolean isReadOnlyReplica; private final BooleanSupplier primaryModeSupplier; + private final Comparator leafSorter; /** * A supplier of the outstanding retention leases. This is used during merged operations to determine which operations that have been @@ -204,6 +207,7 @@ private EngineConfig(Builder builder) { this.isReadOnlyReplica = builder.isReadOnlyReplica; this.primaryModeSupplier = builder.primaryModeSupplier; this.translogFactory = builder.translogFactory; + this.leafSorter = builder.leafSorter; } /** @@ -451,6 +455,15 @@ public TranslogDeletionPolicyFactory getCustomTranslogDeletionPolicyFactory() { return translogDeletionPolicyFactory; } + /** + * Returns subReaderSorter for org.apache.lucene.index.BaseCompositeReader. + * This gets used in lucene IndexReader and decides order of segment read. + * @return comparator + */ + public Comparator getLeafSorter() { + return this.leafSorter; + } + /** * Builder for EngineConfig class * @@ -483,6 +496,7 @@ public static class Builder { private boolean isReadOnlyReplica; private BooleanSupplier primaryModeSupplier; private TranslogFactory translogFactory = new InternalTranslogFactory(); + Comparator leafSorter; public Builder shardId(ShardId shardId) { this.shardId = shardId; @@ -614,6 +628,11 @@ public Builder translogFactory(TranslogFactory translogFactory) { return this; } + public Builder leafSorter(Comparator leafSorter) { + this.leafSorter = leafSorter; + return this; + } + public EngineConfig build() { return new EngineConfig(this); } diff --git a/server/src/main/java/org/opensearch/index/engine/EngineConfigFactory.java b/server/src/main/java/org/opensearch/index/engine/EngineConfigFactory.java index f5a5d50e11220..76b13ee244a2c 100644 --- a/server/src/main/java/org/opensearch/index/engine/EngineConfigFactory.java +++ b/server/src/main/java/org/opensearch/index/engine/EngineConfigFactory.java @@ -10,6 +10,7 @@ import org.apache.logging.log4j.Logger; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.MergePolicy; import org.apache.lucene.search.QueryCache; import org.apache.lucene.search.QueryCachingPolicy; @@ -36,6 +37,7 @@ import java.util.Collection; import java.util.Collections; +import java.util.Comparator; import java.util.List; import java.util.Optional; import java.util.function.BooleanSupplier; @@ -151,7 +153,8 @@ public EngineConfig newEngineConfig( EngineConfig.TombstoneDocSupplier tombstoneDocSupplier, boolean isReadOnlyReplica, BooleanSupplier primaryModeSupplier, - TranslogFactory translogFactory + TranslogFactory translogFactory, + Comparator leafSorter ) { CodecService codecServiceToUse = codecService; if (codecService == null && this.codecServiceFactory != null) { @@ -184,6 +187,7 @@ public EngineConfig newEngineConfig( .readOnlyReplica(isReadOnlyReplica) .primaryModeSupplier(primaryModeSupplier) .translogFactory(translogFactory) + .leafSorter(leafSorter) .build(); } diff --git a/server/src/main/java/org/opensearch/index/engine/InternalEngine.java b/server/src/main/java/org/opensearch/index/engine/InternalEngine.java index 63afc6585a99d..b96bade177be2 100644 --- a/server/src/main/java/org/opensearch/index/engine/InternalEngine.java +++ b/server/src/main/java/org/opensearch/index/engine/InternalEngine.java @@ -2322,6 +2322,9 @@ private IndexWriterConfig getIndexWriterConfig() { if (config().getIndexSort() != null) { iwc.setIndexSort(config().getIndexSort()); } + if (config().getLeafSorter() != null) { + iwc.setLeafSorter(config().getLeafSorter()); // The default segment search order + } return iwc; } diff --git a/server/src/main/java/org/opensearch/index/mapper/MappingLookup.java b/server/src/main/java/org/opensearch/index/mapper/MappingLookup.java index 5bccb4f6e827e..024f4b71584bf 100644 --- a/server/src/main/java/org/opensearch/index/mapper/MappingLookup.java +++ b/server/src/main/java/org/opensearch/index/mapper/MappingLookup.java @@ -33,6 +33,7 @@ package org.opensearch.index.mapper; import org.apache.lucene.analysis.Analyzer; +import org.opensearch.cluster.metadata.DataStream; import org.opensearch.index.IndexSettings; import org.opensearch.index.analysis.FieldNameAnalyzer; @@ -261,6 +262,15 @@ public String getNestedScope(String path) { return null; } + /** + * If this index contains @timestamp field with Date type, it will return true + * @return true or false based on above condition + */ + public boolean containsTimeStampField() { + MappedFieldType timeSeriesFieldType = this.fieldTypeLookup.get(DataStream.TIMESERIES_FIELDNAME); + return timeSeriesFieldType != null && timeSeriesFieldType instanceof DateFieldMapper.DateFieldType; // has to be Date field type + } + private static String parentObject(String field) { int lastDot = field.lastIndexOf('.'); if (lastDot == -1) { diff --git a/server/src/main/java/org/opensearch/index/shard/IndexShard.java b/server/src/main/java/org/opensearch/index/shard/IndexShard.java index fc236d7d97a04..ed38c561c7e29 100644 --- a/server/src/main/java/org/opensearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/opensearch/index/shard/IndexShard.java @@ -58,6 +58,7 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.ThreadInterruptedException; import org.opensearch.common.lucene.store.ByteArrayIndexInput; +import org.opensearch.cluster.metadata.DataStream; import org.opensearch.core.Assertions; import org.opensearch.ExceptionsHelper; import org.opensearch.OpenSearchException; @@ -333,6 +334,7 @@ Runnable getGlobalCheckpointSyncer() { private volatile boolean useRetentionLeasesInPeerRecovery; private final Store remoteStore; private final BiFunction translogFactorySupplier; + private final boolean isTimeSeriesIndex; private final RemoteRefreshSegmentPressureService remoteRefreshSegmentPressureService; public IndexShard( @@ -451,6 +453,9 @@ public boolean shouldCache(Query query) { this.checkpointPublisher = checkpointPublisher; this.remoteStore = remoteStore; this.translogFactorySupplier = translogFactorySupplier; + this.isTimeSeriesIndex = (mapperService == null || mapperService.documentMapper() == null) + ? false + : mapperService.documentMapper().mappers().containsTimeStampField(); this.remoteRefreshSegmentPressureService = remoteRefreshSegmentPressureService; } @@ -3627,7 +3632,9 @@ private EngineConfig newEngineConfig(LongSupplier globalCheckpointSupplier) thro tombstoneDocSupplier(), isReadOnlyReplica, replicationTracker::isPrimaryMode, - translogFactorySupplier.apply(indexSettings, shardRouting) + translogFactorySupplier.apply(indexSettings, shardRouting), + isTimeSeriesDescSortOptimizationEnabled() ? DataStream.TIMESERIES_LEAF_SORTER : null // DESC @timestamp default order for + // timeseries ); } @@ -3639,6 +3646,14 @@ public boolean isRemoteTranslogEnabled() { return indexSettings() != null && indexSettings().isRemoteTranslogStoreEnabled(); } + /** + * @return true if segment reverse search optimization is enabled for time series based workload. + */ + public boolean isTimeSeriesDescSortOptimizationEnabled() { + // Do not change segment order in case of index sort. + return isTimeSeriesIndex && getIndexSort() == null; + } + /** * @return True if settings indicate this shard is backed by a remote snapshot, false otherwise. */ diff --git a/server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java b/server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java index cf21712fc912a..f02d28c2c375b 100644 --- a/server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java +++ b/server/src/main/java/org/opensearch/search/internal/ContextIndexSearcher.java @@ -62,6 +62,7 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.CombinedBitSet; import org.apache.lucene.util.SparseFixedBitSet; +import org.opensearch.cluster.metadata.DataStream; import org.opensearch.common.lucene.search.TopDocsAndMaxScore; import org.opensearch.core.common.lease.Releasable; import org.opensearch.search.DocValueFormat; @@ -282,8 +283,17 @@ public void search( @Override protected void search(List leaves, Weight weight, Collector collector) throws IOException { - for (LeafReaderContext ctx : leaves) { // search each subreader - searchLeaf(ctx, weight, collector); + if (shouldReverseLeafReaderContexts()) { + // reverse the segment search order if this flag is true. + // Certain queries can benefit if we reverse the segment read order, + // for example time series based queries if searched for desc sort order. + for (int i = leaves.size() - 1; i >= 0; i--) { + searchLeaf(leaves.get(i), weight, collector); + } + } else { + for (int i = 0; i < leaves.size(); i++) { + searchLeaf(leaves.get(i), weight, collector); + } } } @@ -496,4 +506,24 @@ private boolean canMatchSearchAfter(LeafReaderContext ctx) throws IOException { } return true; } + + private boolean shouldReverseLeafReaderContexts() { + // Time series based workload by default traverses segments in desc order i.e. latest to the oldest order. + // This is actually beneficial for search queries to start search on latest segments first for time series workload. + // That can slow down ASC order queries on timestamp workload. So to avoid that slowdown, we will reverse leaf + // reader order here. + if (searchContext != null && searchContext.indexShard().isTimeSeriesDescSortOptimizationEnabled()) { + // Only reverse order for asc order sort queries + if (searchContext.sort() != null + && searchContext.sort().sort != null + && searchContext.sort().sort.getSort() != null + && searchContext.sort().sort.getSort().length > 0 + && searchContext.sort().sort.getSort()[0].getReverse() == false + && searchContext.sort().sort.getSort()[0].getField() != null + && searchContext.sort().sort.getSort()[0].getField().equals(DataStream.TIMESERIES_FIELDNAME)) { + return true; + } + } + return false; + } } diff --git a/server/src/test/java/org/opensearch/index/engine/EngineConfigFactoryTests.java b/server/src/test/java/org/opensearch/index/engine/EngineConfigFactoryTests.java index 2db3cd24da80d..f8bedc76ea994 100644 --- a/server/src/test/java/org/opensearch/index/engine/EngineConfigFactoryTests.java +++ b/server/src/test/java/org/opensearch/index/engine/EngineConfigFactoryTests.java @@ -69,7 +69,8 @@ public void testCreateEngineConfigFromFactory() { null, false, () -> Boolean.TRUE, - new InternalTranslogFactory() + new InternalTranslogFactory(), + null ); assertNotNull(config.getCodec()); @@ -148,7 +149,8 @@ public void testCreateCodecServiceFromFactory() { null, false, () -> Boolean.TRUE, - new InternalTranslogFactory() + new InternalTranslogFactory(), + null ); assertNotNull(config.getCodec()); } diff --git a/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java b/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java index 06f74a55be13d..4474aae1f0631 100644 --- a/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java +++ b/test/framework/src/main/java/org/opensearch/test/OpenSearchIntegTestCase.java @@ -761,6 +761,7 @@ public Settings indexSettings() { ).getStringRep() ); } + return builder.build(); }