Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable Fuzzy codec for doc id fields using a bloom filter #11022

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- [Remote cluster state] Make index and global metadata upload timeout dynamic cluster settings ([#10814](https://github.com/opensearch-project/OpenSearch/pull/10814))
- Added cluster setting cluster.restrict.index.replication_type to restrict setting of index setting replication type ([#10866](https://github.com/opensearch-project/OpenSearch/pull/10866))
- Add cluster state stats ([#10670](https://github.com/opensearch-project/OpenSearch/pull/10670))
- Enable Fuzzy codec for doc id fields using a bloom filter ([#11022](https://github.com/opensearch-project/OpenSearch/pull/11022))

### Dependencies
- Bump `com.google.api.grpc:proto-google-common-protos` from 2.10.0 to 2.25.1 ([#10208](https://github.com/opensearch-project/OpenSearch/pull/10208), [#10298](https://github.com/opensearch-project/OpenSearch/pull/10298))
Expand Down Expand Up @@ -141,4 +142,4 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
### Security

[Unreleased 3.0]: https://github.com/opensearch-project/OpenSearch/compare/2.x...HEAD
[Unreleased 2.x]: https://github.com/opensearch-project/OpenSearch/compare/2.12...2.x
[Unreleased 2.x]: https://github.com/opensearch-project/OpenSearch/compare/2.12...2.x
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.benchmark.index.codec.fuzzy;

import org.apache.lucene.util.BytesRef;
import org.opensearch.common.UUIDs;
import org.opensearch.index.codec.fuzzy.FuzzySet;
import org.opensearch.index.codec.fuzzy.FuzzySetFactory;
import org.opensearch.index.codec.fuzzy.FuzzySetParameters;
import org.opensearch.index.mapper.IdFieldMapper;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

@Fork(3)
@Warmup(iterations = 2)
@Measurement(iterations = 5, time = 60, timeUnit = TimeUnit.SECONDS)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
public class FilterConstructionBenchmark {

private List<BytesRef> items;

@Param({ "1000000", "10000000", "50000000" })
private int numIds;

@Param({ "0.0511", "0.1023", "0.2047" })
private double fpp;

private FuzzySetFactory fuzzySetFactory;
private String fieldName;

@Setup
public void setupIds() {
this.fieldName = IdFieldMapper.NAME;
this.items = IntStream.range(0, numIds).mapToObj(i -> new BytesRef(UUIDs.base64UUID())).collect(Collectors.toList());
FuzzySetParameters parameters = new FuzzySetParameters(() -> fpp);
this.fuzzySetFactory = new FuzzySetFactory(Map.of(fieldName, parameters));
}

@Benchmark
public FuzzySet buildFilter() throws IOException {
return fuzzySetFactory.createFuzzySet(items.size(), fieldName, () -> items.iterator());
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.benchmark.index.codec.fuzzy;

import org.apache.lucene.util.BytesRef;
import org.opensearch.common.UUIDs;
import org.opensearch.index.codec.fuzzy.FuzzySet;
import org.opensearch.index.codec.fuzzy.FuzzySetFactory;
import org.opensearch.index.codec.fuzzy.FuzzySetParameters;
import org.opensearch.index.mapper.IdFieldMapper;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;
import org.openjdk.jmh.infra.Blackhole;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

@Fork(3)
@Warmup(iterations = 2)
@Measurement(iterations = 5, time = 60, timeUnit = TimeUnit.SECONDS)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
public class FilterLookupBenchmark {

@Param({ "50000000", "1000000" })
private int numItems;

@Param({ "1000000" })
private int searchKeyCount;

@Param({ "0.0511", "0.1023", "0.2047" })
private double fpp;

private FuzzySet fuzzySet;
private List<BytesRef> items;
private Random random = new Random();

@Setup
public void setupFilter() throws IOException {
String fieldName = IdFieldMapper.NAME;
items = IntStream.range(0, numItems).mapToObj(i -> new BytesRef(UUIDs.base64UUID())).collect(Collectors.toList());
FuzzySetParameters parameters = new FuzzySetParameters(() -> fpp);
fuzzySet = new FuzzySetFactory(Map.of(fieldName, parameters)).createFuzzySet(numItems, fieldName, () -> items.iterator());
}

@Benchmark
public void contains_withExistingKeys(Blackhole blackhole) throws IOException {
for (int i = 0; i < searchKeyCount; i++) {
blackhole.consume(fuzzySet.contains(items.get(random.nextInt(items.size()))) == FuzzySet.Result.MAYBE);
}
}

@Benchmark
public void contains_withRandomKeys(Blackhole blackhole) throws IOException {
for (int i = 0; i < searchKeyCount; i++) {
blackhole.consume(fuzzySet.contains(new BytesRef(UUIDs.base64UUID())));
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,9 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
IndexMetadata.INDEX_REMOTE_SEGMENT_STORE_REPOSITORY_SETTING,
IndexMetadata.INDEX_REMOTE_TRANSLOG_REPOSITORY_SETTING,

IndexSettings.INDEX_DOC_ID_FUZZY_SET_ENABLED_SETTING,
IndexSettings.INDEX_DOC_ID_FUZZY_SET_FALSE_POSITIVE_PROBABILITY_SETTING,

// validate that built-in similarities don't get redefined
Setting.groupSetting("index.similarity.", (s) -> {
Map<String, Settings> groups = s.getAsGroups();
Expand Down
52 changes: 52 additions & 0 deletions server/src/main/java/org/opensearch/index/IndexSettings.java
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@

import static org.opensearch.Version.V_2_7_0;
import static org.opensearch.common.util.FeatureFlags.SEARCHABLE_SNAPSHOT_EXTENDED_COMPATIBILITY;
import static org.opensearch.index.codec.fuzzy.FuzzySetParameters.DEFAULT_FALSE_POSITIVE_PROBABILITY;
import static org.opensearch.index.mapper.MapperService.INDEX_MAPPING_DEPTH_LIMIT_SETTING;
import static org.opensearch.index.mapper.MapperService.INDEX_MAPPING_FIELD_NAME_LENGTH_LIMIT_SETTING;
import static org.opensearch.index.mapper.MapperService.INDEX_MAPPING_NESTED_DOCS_LIMIT_SETTING;
Expand Down Expand Up @@ -658,6 +659,22 @@ public static IndexMergePolicy fromString(String text) {
Property.Dynamic
);

public static final Setting<Boolean> INDEX_DOC_ID_FUZZY_SET_ENABLED_SETTING = Setting.boolSetting(
"index.doc_id_fuzzy_set.enabled",
false,
Property.IndexScope,
Property.Dynamic
);

public static final Setting<Double> INDEX_DOC_ID_FUZZY_SET_FALSE_POSITIVE_PROBABILITY_SETTING = Setting.doubleSetting(
"index.doc_id_fuzzy_set.false_positive_probability",
DEFAULT_FALSE_POSITIVE_PROBABILITY,
0.01,
0.50,
Property.IndexScope,
Property.Dynamic
);

public static final TimeValue DEFAULT_REMOTE_TRANSLOG_BUFFER_INTERVAL = new TimeValue(650, TimeUnit.MILLISECONDS);
public static final TimeValue MINIMUM_REMOTE_TRANSLOG_BUFFER_INTERVAL = TimeValue.ZERO;
public static final Setting<TimeValue> INDEX_REMOTE_TRANSLOG_BUFFER_INTERVAL_SETTING = Setting.timeSetting(
Expand Down Expand Up @@ -787,6 +804,16 @@ private void setRetentionLeaseMillis(final TimeValue retentionLease) {
*/
private volatile UnaryOperator<MergePolicy> mergeOnFlushPolicy;

/**
* Is fuzzy set enabled for doc id
*/
private volatile boolean enableFuzzySetForDocId;

/**
* False positive probability to use while creating fuzzy set.
*/
private volatile double docIdFuzzySetFalsePositiveProbability;

/**
* Returns the default search fields for this index.
*/
Expand Down Expand Up @@ -926,6 +953,10 @@ public IndexSettings(final IndexMetadata indexMetadata, final Settings nodeSetti
* Now this sortField (IndexSort) is stored in SegmentInfo and we need to maintain backward compatibility for them.
*/
widenIndexSortType = IndexMetadata.SETTING_INDEX_VERSION_CREATED.get(settings).before(V_2_7_0);

enableFuzzySetForDocId = scopedSettings.get(INDEX_DOC_ID_FUZZY_SET_ENABLED_SETTING);
docIdFuzzySetFalsePositiveProbability = scopedSettings.get(INDEX_DOC_ID_FUZZY_SET_FALSE_POSITIVE_PROBABILITY_SETTING);

scopedSettings.addSettingsUpdateConsumer(
TieredMergePolicyProvider.INDEX_COMPOUND_FORMAT_SETTING,
tieredMergePolicyProvider::setNoCFSRatio
Expand Down Expand Up @@ -1032,6 +1063,11 @@ public IndexSettings(final IndexMetadata indexMetadata, final Settings nodeSetti
this::setRemoteTranslogUploadBufferInterval
);
scopedSettings.addSettingsUpdateConsumer(INDEX_REMOTE_TRANSLOG_KEEP_EXTRA_GEN_SETTING, this::setRemoteTranslogKeepExtraGen);
scopedSettings.addSettingsUpdateConsumer(INDEX_DOC_ID_FUZZY_SET_ENABLED_SETTING, this::setEnableFuzzySetForDocId);
scopedSettings.addSettingsUpdateConsumer(
INDEX_DOC_ID_FUZZY_SET_FALSE_POSITIVE_PROBABILITY_SETTING,
this::setDocIdFuzzySetFalsePositiveProbability
);
}

private void setSearchIdleAfter(TimeValue searchIdleAfter) {
Expand Down Expand Up @@ -1801,4 +1837,20 @@ public void setDefaultSearchPipeline(String defaultSearchPipeline) {
public boolean shouldWidenIndexSortType() {
return this.widenIndexSortType;
}

public boolean isEnableFuzzySetForDocId() {
return enableFuzzySetForDocId;
}

public void setEnableFuzzySetForDocId(boolean enableFuzzySetForDocId) {
this.enableFuzzySetForDocId = enableFuzzySetForDocId;
}

public double getDocIdFuzzySetFalsePositiveProbability() {
return docIdFuzzySetFalsePositiveProbability;
}

public void setDocIdFuzzySetFalsePositiveProbability(double docIdFuzzySetFalsePositiveProbability) {
this.docIdFuzzySetFalsePositiveProbability = docIdFuzzySetFalsePositiveProbability;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,16 @@
import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
import org.apache.lucene.codecs.lucene95.Lucene95Codec;
import org.opensearch.common.lucene.Lucene;
import org.opensearch.index.codec.fuzzy.FuzzyFilterPostingsFormat;
import org.opensearch.index.codec.fuzzy.FuzzySetFactory;
import org.opensearch.index.codec.fuzzy.FuzzySetParameters;
import org.opensearch.index.mapper.CompletionFieldMapper;
import org.opensearch.index.mapper.IdFieldMapper;
import org.opensearch.index.mapper.MappedFieldType;
import org.opensearch.index.mapper.MapperService;

import java.util.Map;

/**
* {@link PerFieldMappingPostingFormatCodec This postings format} is the default
* {@link PostingsFormat} for OpenSearch. It utilizes the
Expand All @@ -57,6 +63,8 @@ public class PerFieldMappingPostingFormatCodec extends Lucene95Codec {
private final Logger logger;
private final MapperService mapperService;
private final DocValuesFormat dvFormat = new Lucene90DocValuesFormat();
private FuzzySetFactory fuzzySetFactory;
private FuzzyFilterPostingsFormat docIdPostingsFormat;

static {
assert Codec.forName(Lucene.LATEST_CODEC).getClass().isAssignableFrom(PerFieldMappingPostingFormatCodec.class)
Expand All @@ -67,6 +75,12 @@ public PerFieldMappingPostingFormatCodec(Mode compressionMode, MapperService map
super(compressionMode);
this.mapperService = mapperService;
this.logger = logger;
fuzzySetFactory = new FuzzySetFactory(
Map.of(
IdFieldMapper.NAME,
new FuzzySetParameters(() -> mapperService.getIndexSettings().getDocIdFuzzySetFalsePositiveProbability())
)
);
}

@Override
Expand All @@ -76,6 +90,11 @@ public PostingsFormat getPostingsFormatForField(String field) {
logger.warn("no index mapper found for field: [{}] returning default postings format", field);
} else if (fieldType instanceof CompletionFieldMapper.CompletionFieldType) {
return CompletionFieldMapper.CompletionFieldType.postingsFormat();
} else if (IdFieldMapper.NAME.equals(field) && mapperService.getIndexSettings().isEnableFuzzySetForDocId()) {
if (docIdPostingsFormat == null) {
docIdPostingsFormat = new FuzzyFilterPostingsFormat(super.getPostingsFormatForField(field), fuzzySetFactory);
}
return docIdPostingsFormat;
}
return super.getPostingsFormatForField(field);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.index.codec.fuzzy;

import org.apache.lucene.util.BytesRef;
import org.opensearch.common.CheckedSupplier;

import java.io.IOException;
import java.util.Iterator;

/**
* Encapsulates common behaviour implementation for a fuzzy set.
*/
public abstract class AbstractFuzzySet implements FuzzySet {

/**
* Add an item to this fuzzy set.
* @param value The value to be added
*/
protected abstract void add(BytesRef value);

/**
* Add all items to the underlying set.
* Implementations can choose to perform this using an optimized strategy based on the type of set.
* @param valuesIteratorProvider Supplier for an iterator over All values which should be added to the set.
*/
protected void addAll(CheckedSupplier<Iterator<BytesRef>, IOException> valuesIteratorProvider) throws IOException {
Iterator<BytesRef> values = valuesIteratorProvider.get();
while (values.hasNext()) {
add(values.next());
}
}

protected long generateKey(BytesRef value) {
return MurmurHash64.INSTANCE.hash(value);
}

protected void assertAllElementsExist(CheckedSupplier<Iterator<BytesRef>, IOException> iteratorProvider) throws IOException {
Iterator<BytesRef> iter = iteratorProvider.get();
int cnt = 0;
while (iter.hasNext()) {
BytesRef item = iter.next();
assert contains(item) == Result.MAYBE
: "Expected Filter to return positive response for elements added to it. Elements matched: " + cnt;
cnt++;
}
}
}
Loading
Loading