diff --git a/docs/changelog/89016.yaml b/docs/changelog/89016.yaml new file mode 100644 index 0000000000000..98f15e8fe1d7d --- /dev/null +++ b/docs/changelog/89016.yaml @@ -0,0 +1,6 @@ +pr: 89016 +summary: Avoid negative scores with `cross_fields` type +area: Ranking +type: bug +issues: + - 44700 diff --git a/docs/reference/query-dsl/multi-match-query.asciidoc b/docs/reference/query-dsl/multi-match-query.asciidoc index b27cdb4c548b4..a7db337004dcd 100644 --- a/docs/reference/query-dsl/multi-match-query.asciidoc +++ b/docs/reference/query-dsl/multi-match-query.asciidoc @@ -388,11 +388,12 @@ explanation: Also, accepts `analyzer`, `boost`, `operator`, `minimum_should_match`, `lenient` and `zero_terms_query`. -WARNING: The `cross_fields` type blends field statistics in a way that does -not always produce well-formed scores (for example scores can become -negative). As an alternative, you can consider the -<> query, which is also -term-centric but combines field statistics in a more robust way. +WARNING: The `cross_fields` type blends field statistics in a complex way that +can be hard to interpret. The score combination can even be incorrect, in +particular when some documents contain some of the search fields, but not all +of them. You should consider the +<> query as an alternative, +which is also term-centric but combines field statistics in a more robust way. [[cross-field-analysis]] ===== `cross_field` and analysis diff --git a/server/src/main/java/org/elasticsearch/lucene/queries/BlendedTermQuery.java b/server/src/main/java/org/elasticsearch/lucene/queries/BlendedTermQuery.java index a57feef2f9b23..0c08f58909a01 100644 --- a/server/src/main/java/org/elasticsearch/lucene/queries/BlendedTermQuery.java +++ b/server/src/main/java/org/elasticsearch/lucene/queries/BlendedTermQuery.java @@ -148,7 +148,10 @@ protected int compare(int i, int j) { if (prev > current) { actualDf++; } - contexts[i] = ctx = adjustDF(reader.getContext(), ctx, Math.min(maxDoc, actualDf)); + + int docCount = reader.getDocCount(terms[i].field()); + int newDocFreq = Math.min(actualDf, docCount); + contexts[i] = ctx = adjustDF(reader.getContext(), ctx, newDocFreq); prev = current; sumTTF += ctx.totalTermFreq(); } diff --git a/server/src/test/java/org/elasticsearch/lucene/queries/BlendedTermQueryTests.java b/server/src/test/java/org/elasticsearch/lucene/queries/BlendedTermQueryTests.java index 3188aba723a7d..4be9b04b5bce6 100644 --- a/server/src/test/java/org/elasticsearch/lucene/queries/BlendedTermQueryTests.java +++ b/server/src/test/java/org/elasticsearch/lucene/queries/BlendedTermQueryTests.java @@ -248,6 +248,39 @@ public void testMinTTF() throws IOException { dir.close(); } + public void testMissingFields() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random()))); + FieldType ft = new FieldType(TextField.TYPE_NOT_STORED); + ft.freeze(); + + for (int i = 0; i < 10; i++) { + Document d = new Document(); + d.add(new TextField("id", Integer.toString(i), Field.Store.YES)); + d.add(new Field("dense", "foo", ft)); + // Add a sparse field with high totalTermFreq but low docCount + if (i % 5 == 0) { + d.add(new Field("sparse", "foo", ft)); + d.add(new Field("sparse", "one two three four five size", ft)); + } + w.addDocument(d); + } + w.commit(); + + DirectoryReader reader = DirectoryReader.open(w); + IndexSearcher searcher = setSimilarity(newSearcher(reader)); + + String[] fields = new String[] { "dense", "sparse" }; + Query query = BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "foo"), 0.1f); + TopDocs search = searcher.search(query, 10); + ScoreDoc[] scoreDocs = search.scoreDocs; + assertEquals(Integer.toString(0), reader.document(scoreDocs[0].doc).getField("id").stringValue()); + + reader.close(); + w.close(); + dir.close(); + } + public void testEqualsAndHash() { String[] fields = new String[1 + random().nextInt(10)]; for (int i = 0; i < fields.length; i++) {