diff --git a/server/src/main/java/org/apache/lucene/queries/BlendedTermQuery.java b/server/src/main/java/org/apache/lucene/queries/BlendedTermQuery.java index 737db4a0ca2da..c7a7a54c5e93e 100644 --- a/server/src/main/java/org/apache/lucene/queries/BlendedTermQuery.java +++ b/server/src/main/java/org/apache/lucene/queries/BlendedTermQuery.java @@ -152,7 +152,15 @@ protected int compare(int i, int j) { } int docCount = reader.getDocCount(terms[i].field()); - int newDocFreq = Math.min(actualDf, docCount); + + // IMPORTANT: we make two adjustments here to ensure the new document frequency is valid: + // 1. We take a minimum with docCount, which is the total number of documents that contain + // this field. The document frequency must always be less than the document count. + // 2. We also take a minimum with maxDoc. Earlier, maxDoc is adjusted to the minimum of + // maxDoc and minTTF. So taking the minimum ensures that the document frequency is never + // greater than the total term frequency, which would be illegal. 
+ int newDocFreq = Math.min(Math.min(actualDf, docCount), maxDoc); + contexts[i] = ctx = adjustDF(reader.getContext(), ctx, newDocFreq); prev = current; sumTTF += ctx.totalTermFreq(); diff --git a/server/src/test/java/org/apache/lucene/queries/BlendedTermQueryTests.java b/server/src/test/java/org/apache/lucene/queries/BlendedTermQueryTests.java index 0a28b3076c3d3..6386b9caa985c 100644 --- a/server/src/test/java/org/apache/lucene/queries/BlendedTermQueryTests.java +++ b/server/src/test/java/org/apache/lucene/queries/BlendedTermQueryTests.java @@ -40,8 +40,10 @@ import java.io.IOException; import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; import static org.hamcrest.Matchers.containsInAnyOrder; @@ -233,16 +235,22 @@ public void testMinTTF() throws IOException { Document d = new Document(); d.add(new TextField("id", Integer.toString(i), Field.Store.YES)); d.add(new Field("dense", "foo foo foo", ft)); - if (i % 10 == 0) { + if (i % 2 == 0) { d.add(new Field("sparse", "foo", ft)); } + if (i % 10 == 0) { + d.add(new Field("more_sparse", "foo", ft)); + } w.addDocument(d); } + w.commit(); + w.forceMerge(1); + DirectoryReader reader = DirectoryReader.open(w); IndexSearcher searcher = setSimilarity(newSearcher(reader)); { - String[] fields = new String[] { "dense", "sparse" }; + String[] fields = new String[] { "dense", "sparse", "more_sparse" }; Query query = BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "foo"), 0.1f); TopDocs search = searcher.search(query, 10); ScoreDoc[] scoreDocs = search.scoreDocs; @@ -253,6 +261,55 @@ public void testMinTTF() throws IOException { dir.close(); } + public void testRandomFields() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random()))); + FieldType ft = new FieldType(TextField.TYPE_NOT_STORED); + ft.freeze(); + + Map<String, Float>
fields = new HashMap<>(); + fields.put("field", 1.0f); + + int numRandomFields = random().nextInt(7); + for (int i = 0; i < numRandomFields; i++) { + String field = "field" + i; + float probability = randomBoolean() ? 1.0f : randomFloat(); + fields.put(field, probability); + } + + int numDocs = atLeast(100); + for (int i = 0; i < numDocs; i++) { + Document d = new Document(); + for (Map.Entry<String, Float> entry : fields.entrySet()) { + String field = entry.getKey(); + float probability = entry.getValue(); + if (randomFloat() < probability) { + String value = randomBoolean() ? "foo" : "foo foo foo"; + d.add(new Field(field, value, ft)); + } + if (randomFloat() < probability) { + d.add(new Field(field, "bar bar", ft)); + } + } + w.addDocument(d); + } + + w.commit(); + + DirectoryReader reader = DirectoryReader.open(w); + IndexSearcher searcher = setSimilarity(newSearcher(reader)); + { + String[] fieldNames = fields.keySet().toArray(new String[0]); + Query query = BlendedTermQuery.dismaxBlendedQuery(toTerms(fieldNames, "foo"), 0.1f); + TopDocs search = searcher.search(query, 10); + assertTrue(search.totalHits.value > 0); + assertTrue(search.scoreDocs.length > 0); + } + reader.close(); + w.close(); + dir.close(); + } + public void testMissingFields() throws IOException { Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));