Ensure cross_fields always uses valid term statistics (#90278)
In #89016 we adjusted the `cross_fields` scoring formula to prevent negative
scores. This fix accidentally dropped another important fix that was added in
#41938. Specifically, we need to make sure to take the minimum between the
document frequency (`actualDf`) and the minimum total term frequency
(`minTTF`). Otherwise, we can produce invalid term statistics where the total
term frequency is less than the document frequency.

Fixes #90275
jtibshirani authored Sep 23, 2022
1 parent b76b968 commit 1acc278
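
To make the invariant concrete, here is a minimal standalone sketch (not part of the commit; the variable names mirror the diff below, but the values and the `main` wrapper are hypothetical):

```java
public class DocFreqClampSketch {
    public static void main(String[] args) {
        // Hypothetical statistics for one blended term:
        int actualDf = 50; // blended document frequency across fields
        int docCount = 40; // number of documents that contain this field at all
        int maxDoc = 10;   // already lowered to min(maxDoc, minTTF) earlier in the method

        // The fix from #41938 that #89016 accidentally dropped: clamp by maxDoc
        // (which carries minTTF) as well as by docCount. Without the second clamp,
        // newDocFreq would be 40, exceeding a total term frequency of 10, and the
        // resulting term statistics would be invalid.
        int newDocFreq = Math.min(Math.min(actualDf, docCount), maxDoc);
        System.out.println("newDocFreq = " + newDocFreq); // prints 10
    }
}
```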
Showing 3 changed files with 79 additions and 3 deletions.
docs/reference/release-notes/8.4.2.asciidoc (11 additions, 0 deletions)
@@ -3,6 +3,17 @@

Also see <<breaking-changes-8.4,Breaking changes in 8.4>>.

=== Known issues

* **This version contains a regression in `multi_match` queries that use the
`cross_fields` scoring type.**
+
When running a <<query-dsl-multi-match-query,`multi_match`>> query with the
`cross_fields` type, {es} can sometimes throw an `IllegalArgumentException`
with the message "totalTermFreq must be at least docFreq". If you use the
`cross_fields` scoring type, we recommend skipping version 8.4.2. This
regression is fixed in version 8.4.3.

[[bug-8.4.2]]
[float]
=== Bug fixes
@@ -150,7 +150,15 @@ protected int compare(int i, int j) {
}

int docCount = reader.getDocCount(terms[i].field());
int newDocFreq = Math.min(actualDf, docCount);

// IMPORTANT: we make two adjustments here to ensure the new document frequency is valid:
// 1. We take a minimum with docCount, which is the total number of documents that contain
// this field. The document frequency must always be less than or equal to the document
// count.
// 2. We also take a minimum with maxDoc. Earlier in this method, maxDoc is adjusted to the
// minimum of maxDoc and minTTF. So taking the minimum here ensures that the document
// frequency is never greater than the total term frequency, which would be illegal.
int newDocFreq = Math.min(Math.min(actualDf, docCount), maxDoc);

contexts[i] = ctx = adjustDF(reader.getContext(), ctx, newDocFreq);
prev = current;
sumTTF += ctx.totalTermFreq();
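For context on why the clamp matters: Lucene's `TermStatistics` constructor enforces the invariant directly, producing the exact error quoted in the release note above. A minimal demonstration (assumed Lucene 9.x API; not part of the commit):

```java
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef;

public class InvalidTermStatsDemo {
    public static void main(String[] args) {
        try {
            // A term that supposedly appears in 40 documents but occurs only 10
            // times in total is impossible, so Lucene rejects the statistics.
            new TermStatistics(new BytesRef("foo"), 40, 10);
        } catch (IllegalArgumentException e) {
            System.out.println(e.getMessage()); // "totalTermFreq must be at least docFreq..."
        }
    }
}
```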
@@ -37,8 +37,10 @@
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import static org.hamcrest.Matchers.containsInAnyOrder;
@@ -228,16 +230,22 @@ public void testMinTTF() throws IOException {
Document d = new Document();
d.add(new TextField("id", Integer.toString(i), Field.Store.YES));
d.add(new Field("dense", "foo foo foo", ft));
if (i % 10 == 0) {
if (i % 2 == 0) {
d.add(new Field("sparse", "foo", ft));
}
if (i % 10 == 0) {
d.add(new Field("more_sparse", "foo", ft));
}
w.addDocument(d);
}

w.commit();
w.forceMerge(1);

DirectoryReader reader = DirectoryReader.open(w);
IndexSearcher searcher = setSimilarity(newSearcher(reader));
{
String[] fields = new String[] { "dense", "sparse" };
String[] fields = new String[] { "dense", "sparse", "more_sparse" };
Query query = BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "foo"), 0.1f);
TopDocs search = searcher.search(query, 10);
ScoreDoc[] scoreDocs = search.scoreDocs;
@@ -248,6 +256,55 @@ public void testMinTTF() throws IOException {
dir.close();
}

public void testRandomFields() throws IOException {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
ft.freeze();

Map<String, Float> fields = new HashMap<>();
fields.put("field", 1.0f);

int numRandomFields = random().nextInt(7);
for (int i = 0; i < numRandomFields; i++) {
String field = "field" + i;
float probability = randomBoolean() ? 1.0f : randomFloat();
fields.put(field, probability);
}

int numDocs = atLeast(100);
for (int i = 0; i < numDocs; i++) {
Document d = new Document();
for (Map.Entry<String, Float> entry : fields.entrySet()) {
String field = entry.getKey();
float probability = entry.getValue();
if (randomFloat() < probability) {
String value = randomBoolean() ? "foo" : "foo foo foo";
d.add(new Field(field, value, ft));
}
if (randomFloat() < probability) {
d.add(new Field(field, "bar bar", ft));
}
}
w.addDocument(d);
}

w.commit();

DirectoryReader reader = DirectoryReader.open(w);
IndexSearcher searcher = setSimilarity(newSearcher(reader));
{
String[] fieldNames = fields.keySet().toArray(new String[0]);
Query query = BlendedTermQuery.dismaxBlendedQuery(toTerms(fieldNames, "foo"), 0.1f);
TopDocs search = searcher.search(query, 10);
assertTrue(search.totalHits.value > 0);
assertTrue(search.scoreDocs.length > 0);
}
reader.close();
w.close();
dir.close();
}

public void testMissingFields() throws IOException {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
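As a worked example of the scenario `testMinTTF` now exercises (assuming, hypothetically, 100 documents in the indexing loop above; the class and variable names here are illustrative):

```java
public class MinTtfArithmetic {
    public static void main(String[] args) {
        int numDocs = 100; // hypothetical loop bound

        long denseTtf = numDocs * 3L;      // "foo foo foo" in every doc -> 300
        long sparseTtf = numDocs / 2;      // "foo" in every 2nd doc     -> 50
        long moreSparseTtf = numDocs / 10; // "foo" in every 10th doc    -> 10

        // The minimum total term frequency across the blended fields is 10
        // (more_sparse), so any blended document frequency must be capped at 10;
        // otherwise more_sparse would report docFreq > totalTermFreq.
        long minTtf = Math.min(denseTtf, Math.min(sparseTtf, moreSparseTtf)); // 10
        long sparseDf = numDocs / 2; // 50 documents contain "foo" in sparse

        long newDocFreq = Math.min(sparseDf, minTtf);
        System.out.println("blended docFreq capped at " + newDocFreq); // prints 10
    }
}
```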
