From df150581217d2f4779964c75bdd461fb2ef7f0f3 Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Tue, 16 Jul 2024 13:40:27 +0200 Subject: [PATCH] Make FieldInfos dedup run for all codecs (#110913) The initial fix here didn't properly apply to all situations; let's make this a little more robust so that it covers all codecs. --- .../index/codec/CodecService.java | 27 ++++++++++++++++++- .../index/codec/Elasticsearch814Codec.java | 11 +------- .../elasticsearch/index/codec/CodecTests.java | 3 --- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/CodecService.java b/server/src/main/java/org/elasticsearch/index/codec/CodecService.java index 3ebcd1cb5b420..a8dfa3079ea27 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/CodecService.java +++ b/server/src/main/java/org/elasticsearch/index/codec/CodecService.java @@ -9,6 +9,8 @@ package org.elasticsearch.index.codec; import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.FieldInfosFormat; +import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.lucene99.Lucene99Codec; import org.elasticsearch.common.util.BigArrays; import org.elasticsearch.common.util.FeatureFlag; @@ -18,6 +20,7 @@ import java.util.HashMap; import java.util.Map; +import java.util.stream.Collectors; /** * Since Lucene 4.0 low level index segments are read and written through a @@ -65,7 +68,13 @@ public CodecService(@Nullable MapperService mapperService, BigArrays bigArrays) for (String codec : Codec.availableCodecs()) { codecs.put(codec, Codec.forName(codec)); } - this.codecs = Map.copyOf(codecs); + this.codecs = codecs.entrySet().stream().collect(Collectors.toUnmodifiableMap(Map.Entry::getKey, e -> { + var codec = e.getValue(); + if (codec instanceof DeduplicateFieldInfosCodec) { + return codec; + } + return new DeduplicateFieldInfosCodec(codec.getName(), codec); + })); } public Codec codec(String name) { @@ -82,4 +91,20 @@ public Codec codec(String name) 
{ public String[] availableCodecs() { return codecs.keySet().toArray(new String[0]); } + + public static class DeduplicateFieldInfosCodec extends FilterCodec { + + private final DeduplicatingFieldInfosFormat deduplicatingFieldInfosFormat; + + protected DeduplicateFieldInfosCodec(String name, Codec delegate) { + super(name, delegate); + this.deduplicatingFieldInfosFormat = new DeduplicatingFieldInfosFormat(super.fieldInfosFormat()); + } + + @Override + public final FieldInfosFormat fieldInfosFormat() { + return deduplicatingFieldInfosFormat; + } + + } } diff --git a/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch814Codec.java b/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch814Codec.java index dd7a668605e57..301d3129f7c2a 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch814Codec.java +++ b/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch814Codec.java @@ -9,8 +9,6 @@ package org.elasticsearch.index.codec; import org.apache.lucene.codecs.DocValuesFormat; -import org.apache.lucene.codecs.FieldInfosFormat; -import org.apache.lucene.codecs.FilterCodec; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.StoredFieldsFormat; @@ -27,12 +25,10 @@ * Elasticsearch codec as of 8.14. This extends the Lucene 9.9 codec to compressed stored fields with ZSTD instead of LZ4/DEFLATE. See * {@link Zstd814StoredFieldsFormat}. 
*/ -public class Elasticsearch814Codec extends FilterCodec { +public class Elasticsearch814Codec extends CodecService.DeduplicateFieldInfosCodec { private final StoredFieldsFormat storedFieldsFormat; - private final FieldInfosFormat fieldInfosFormat; - private final PostingsFormat defaultPostingsFormat; private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() { @Override @@ -72,7 +68,6 @@ public Elasticsearch814Codec(Zstd814StoredFieldsFormat.Mode mode) { this.defaultPostingsFormat = new Lucene99PostingsFormat(); this.defaultDVFormat = new Lucene90DocValuesFormat(); this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat(); - this.fieldInfosFormat = new DeduplicatingFieldInfosFormat(delegate.fieldInfosFormat()); } @Override @@ -132,8 +127,4 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { return defaultKnnVectorsFormat; } - @Override - public FieldInfosFormat fieldInfosFormat() { - return fieldInfosFormat; - } } diff --git a/server/src/test/java/org/elasticsearch/index/codec/CodecTests.java b/server/src/test/java/org/elasticsearch/index/codec/CodecTests.java index 3c687f1792d0d..2c2af49e7d062 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/CodecTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/CodecTests.java @@ -10,7 +10,6 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat; -import org.apache.lucene.codecs.lucene99.Lucene99Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.IntField; @@ -73,7 +72,6 @@ public void testBestCompression() throws Exception { public void testLegacyDefault() throws Exception { Codec codec = createCodecService().codec("legacy_default"); - assertThat(codec, Matchers.instanceOf(Lucene99Codec.class)); assertThat(codec.storedFieldsFormat(), Matchers.instanceOf(Lucene90StoredFieldsFormat.class)); // Make sure the legacy codec is 
writable try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setCodec(codec))) { @@ -87,7 +85,6 @@ public void testLegacyDefault() throws Exception { public void testLegacyBestCompression() throws Exception { Codec codec = createCodecService().codec("legacy_best_compression"); - assertThat(codec, Matchers.instanceOf(Lucene99Codec.class)); assertThat(codec.storedFieldsFormat(), Matchers.instanceOf(Lucene90StoredFieldsFormat.class)); // Make sure the legacy codec is writable try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, newIndexWriterConfig().setCodec(codec))) {