From b031df7e8c100ccb6e715c209547c61d4cefdb62 Mon Sep 17 00:00:00 2001 From: Julie Tibshirani Date: Thu, 22 Sep 2022 15:01:43 -0700 Subject: [PATCH 1/6] Ensure cross_fields always uses valid term statistics --- .../lucene/queries/BlendedTermQuery.java | 10 ++- .../lucene/queries/BlendedTermQueryTests.java | 61 ++++++++++++++++++- 2 files changed, 68 insertions(+), 3 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/lucene/queries/BlendedTermQuery.java b/server/src/main/java/org/elasticsearch/lucene/queries/BlendedTermQuery.java index 0c08f58909a01..6fbfa96f8f04b 100644 --- a/server/src/main/java/org/elasticsearch/lucene/queries/BlendedTermQuery.java +++ b/server/src/main/java/org/elasticsearch/lucene/queries/BlendedTermQuery.java @@ -150,7 +150,15 @@ protected int compare(int i, int j) { } int docCount = reader.getDocCount(terms[i].field()); - int newDocFreq = Math.min(actualDf, docCount); + + // IMPORTANT: we make two adjustments here to ensure the new document frequency is valid: + // 1. We take a minimum with docCount, which is the total number of documents that contain + // this field. The document frequency must always be less than the document count. + // 2. We also take a minimum with maxDoc. Earlier, maxDoc is adjusted to the minimum of + // maxDoc and minTTF. So taking the minimum ensures that the document frequency is never + // greater than the total term frequency, which would be illegal. + int newDocFreq = Math.min(Math.min(actualDf, docCount), maxDoc); + contexts[i] = ctx = adjustDF(reader.getContext(), ctx, newDocFreq); prev = current; sumTTF += ctx.totalTermFreq(); diff --git a/server/src/test/java/org/elasticsearch/lucene/queries/BlendedTermQueryTests.java b/server/src/test/java/org/elasticsearch/lucene/queries/BlendedTermQueryTests.java index 4be9b04b5bce6..01b956d3b74ba 100644 --- a/server/src/test/java/org/elasticsearch/lucene/queries/BlendedTermQueryTests.java +++ b/server/src/test/java/org/elasticsearch/lucene/queries/BlendedTermQueryTests.java @@ -37,8 +37,10 @@ import java.io.IOException; import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; import static org.hamcrest.Matchers.containsInAnyOrder; @@ -228,16 +230,22 @@ public void testMinTTF() throws IOException { Document d = new Document(); d.add(new TextField("id", Integer.toString(i), Field.Store.YES)); d.add(new Field("dense", "foo foo foo", ft)); - if (i % 10 == 0) { + if (i % 2 == 0) { d.add(new Field("sparse", "foo", ft)); } + if (i % 10 == 0) { + d.add(new Field("more_sparse", "foo", ft)); + } w.addDocument(d); } + w.commit(); + w.forceMerge(1); + DirectoryReader reader = DirectoryReader.open(w); IndexSearcher searcher = setSimilarity(newSearcher(reader)); { - String[] fields = new String[] { "dense", "sparse" }; + String[] fields = new String[] { "dense", "sparse", "more_sparse" }; Query query = BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "foo"), 0.1f); TopDocs search = searcher.search(query, 10); ScoreDoc[] scoreDocs = search.scoreDocs; @@ -248,6 +256,55 @@ public void testMinTTF() throws IOException { dir.close(); } + public void testRandomDocuments() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random()))); + FieldType ft = new FieldType(TextField.TYPE_NOT_STORED); + ft.freeze(); + + Map fields = new HashMap<>(); + fields.put("field", 1.0f); + + int numRandomFields = random().nextInt(7); + for (int i = 0; i < numRandomFields; i++) { + String field = "field" + i; + float probability = randomBoolean() ? 1.0f : randomFloat(); + fields.put(field, probability); + } + + int numDocs = atLeast(100); + for (int i = 0; i < numDocs; i++) { + Document d = new Document(); + for (Map.Entry entry : fields.entrySet()) { + String field = entry.getKey(); + float probability = entry.getValue(); + if (randomFloat() < probability) { + String value = randomBoolean() ? "foo" : "foo foo foo"; + d.add(new Field(field, value, ft)); + } + if (randomFloat() < probability) { + d.add(new Field(field, "bar bar", ft)); + } + } + w.addDocument(d); + } + + w.commit(); + + DirectoryReader reader = DirectoryReader.open(w); + IndexSearcher searcher = setSimilarity(newSearcher(reader)); + { + String[] fieldNames = fields.keySet().toArray(new String[0]); + Query query = BlendedTermQuery.dismaxBlendedQuery(toTerms(fieldNames, "foo"), 0.1f); + TopDocs search = searcher.search(query, 10); + assertTrue(search.totalHits.value > 0); + assertTrue(search.scoreDocs.length > 0); + } + reader.close(); + w.close(); + dir.close(); + } + public void testMissingFields() throws IOException { Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random()))); From 7a2279fd9bf0dbdde3bb2dc38a7bc3571e7431ad Mon Sep 17 00:00:00 2001 From: Julie Tibshirani Date: Thu, 22 Sep 2022 15:22:22 -0700 Subject: [PATCH 2/6] Update docs/changelog/90278.yaml --- docs/changelog/90278.yaml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 docs/changelog/90278.yaml diff --git a/docs/changelog/90278.yaml b/docs/changelog/90278.yaml new file mode 100644 index 0000000000000..8604434022e01 --- /dev/null +++ b/docs/changelog/90278.yaml @@ -0,0 +1,6 @@ +pr: 90278 +summary: Ensure `cross_fields` always uses valid term statistics +area: Ranking +type: regression +issues: + - 90275 From 84e3f2f0fb6e9bcb87479f400b80b9f221779c74 Mon Sep 17 00:00:00 2001 From: Julie Tibshirani Date: Thu, 22 Sep 2022 15:28:01 -0700 Subject: [PATCH 3/6] Rename test --- .../org/elasticsearch/lucene/queries/BlendedTermQueryTests.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/test/java/org/elasticsearch/lucene/queries/BlendedTermQueryTests.java b/server/src/test/java/org/elasticsearch/lucene/queries/BlendedTermQueryTests.java index 01b956d3b74ba..f1168e183514d 100644 --- a/server/src/test/java/org/elasticsearch/lucene/queries/BlendedTermQueryTests.java +++ b/server/src/test/java/org/elasticsearch/lucene/queries/BlendedTermQueryTests.java @@ -256,7 +256,7 @@ public void testMinTTF() throws IOException { dir.close(); } - public void testRandomDocuments() throws IOException { + public void testRandomFields() throws IOException { Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random()))); FieldType ft = new FieldType(TextField.TYPE_NOT_STORED); From 308c9a47d7f333ba98054e67567aa4455db37fb7 Mon Sep 17 00:00:00 2001 From: Julie Tibshirani Date: Thu, 22 Sep 2022 15:33:57 -0700 Subject: [PATCH 4/6] Fix static analysis --- .../lucene/queries/BlendedTermQuery.java | 12 ++++++------ .../lucene/queries/BlendedTermQueryTests.java | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/lucene/queries/BlendedTermQuery.java b/server/src/main/java/org/elasticsearch/lucene/queries/BlendedTermQuery.java index 6fbfa96f8f04b..20cc5b411b63b 100644 --- a/server/src/main/java/org/elasticsearch/lucene/queries/BlendedTermQuery.java +++ b/server/src/main/java/org/elasticsearch/lucene/queries/BlendedTermQuery.java @@ -152,13 +152,13 @@ protected int compare(int i, int j) { int docCount = reader.getDocCount(terms[i].field()); // IMPORTANT: we make two adjustments here to ensure the new document frequency is valid: - // 1. We take a minimum with docCount, which is the total number of documents that contain - // this field. The document frequency must always be less than the document count. - // 2. We also take a minimum with maxDoc. Earlier, maxDoc is adjusted to the minimum of - // maxDoc and minTTF. So taking the minimum ensures that the document frequency is never - // greater than the total term frequency, which would be illegal. + // 1. We take a minimum with docCount, which is the total number of documents that contain + // this field. The document frequency must always be less than the document count. + // 2. We also take a minimum with maxDoc. Earlier, maxDoc is adjusted to the minimum of + // maxDoc and minTTF. So taking the minimum ensures that the document frequency is never + // greater than the total term frequency, which would be illegal. int newDocFreq = Math.min(Math.min(actualDf, docCount), maxDoc); - + contexts[i] = ctx = adjustDF(reader.getContext(), ctx, newDocFreq); prev = current; sumTTF += ctx.totalTermFreq(); diff --git a/server/src/test/java/org/elasticsearch/lucene/queries/BlendedTermQueryTests.java b/server/src/test/java/org/elasticsearch/lucene/queries/BlendedTermQueryTests.java index f1168e183514d..02a79ccba5bb2 100644 --- a/server/src/test/java/org/elasticsearch/lucene/queries/BlendedTermQueryTests.java +++ b/server/src/test/java/org/elasticsearch/lucene/queries/BlendedTermQueryTests.java @@ -268,7 +268,7 @@ public void testRandomFields() throws IOException { int numRandomFields = random().nextInt(7); for (int i = 0; i < numRandomFields; i++) { String field = "field" + i; - float probability = randomBoolean() ? 1.0f : randomFloat(); + float probability = randomBoolean() ? 1.0f : randomFloat(); fields.put(field, probability); } From 64c0eeed93a7d331b178e7e0b998cfb9d49e67d9 Mon Sep 17 00:00:00 2001 From: Julie Tibshirani Date: Thu, 22 Sep 2022 15:41:56 -0700 Subject: [PATCH 5/6] Add a known issues entry --- docs/reference/release-notes/8.4.2.asciidoc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/reference/release-notes/8.4.2.asciidoc b/docs/reference/release-notes/8.4.2.asciidoc index e74ce8470137a..e4117d48ad78b 100644 --- a/docs/reference/release-notes/8.4.2.asciidoc +++ b/docs/reference/release-notes/8.4.2.asciidoc @@ -3,6 +3,17 @@ Also see <>. +=== Known issues + +* **This version contains a regression in `multi_match` queries that use the +`cross_fields` scoring type.** {es} ++ +When running a <> query with the +`cross_fields` type, {es} can sometimes throw an IllegalArgument exception +with the message "totalTermFreq must be at least docFreq". If you use the +`cross_fields` scoring type, it is recommended that you skip version 8.4.2. +This regression was fixed in version 8.4.3. + [[bug-8.4.2]] [float] === Bug fixes From 2eaf93616b257fdb947d9b82bdf74a450931d19d Mon Sep 17 00:00:00 2001 From: Julie Tibshirani Date: Thu, 22 Sep 2022 15:45:05 -0700 Subject: [PATCH 6/6] Delete docs/changelog/90278.yaml --- docs/changelog/90278.yaml | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 docs/changelog/90278.yaml diff --git a/docs/changelog/90278.yaml b/docs/changelog/90278.yaml deleted file mode 100644 index 8604434022e01..0000000000000 --- a/docs/changelog/90278.yaml +++ /dev/null @@ -1,6 +0,0 @@ -pr: 90278 -summary: Ensure `cross_fields` always uses valid term statistics -area: Ranking -type: regression -issues: - - 90275