elastic · jtibshirani · Sep 6, 2022 · Aug 1, 2022 · Aug 4, 2022 · Aug 4, 2022
diff --git a/docs/changelog/89016.yaml b/docs/changelog/89016.yaml
@@ -0,0 +1,6 @@
+pr: 89016
+summary: Avoid negative scores with `cross_fields` type
+area: Ranking
+type: bug
+issues:
+ - 44700
diff --git a/docs/reference/query-dsl/multi-match-query.asciidoc b/docs/reference/query-dsl/multi-match-query.asciidoc
@@ -388,11 +388,12 @@ explanation:
 Also, accepts `analyzer`, `boost`, `operator`, `minimum_should_match`,
 `lenient` and `zero_terms_query`.
 
-WARNING: The `cross_fields` type blends field statistics in a way that does
-not always produce well-formed scores (for example scores can become
-negative). As an alternative, you can consider the
-<<query-dsl-combined-fields-query,`combined_fields`>> query, which is also
-term-centric but combines field statistics in a more robust way.
+WARNING: The `cross_fields` type blends field statistics in a complex way that
+can be hard to interpret. The score combination can even be incorrect, in
+particular when some documents contain only some of the search fields rather
+than all of them. As an alternative, consider the
+<<query-dsl-combined-fields-query,`combined_fields`>> query, which is also
+term-centric but combines field statistics in a more robust way.
 
 [[cross-field-analysis]]
 ===== `cross_field` and analysis

diff --git a/server/src/main/java/org/elasticsearch/lucene/queries/BlendedTermQuery.java b/server/src/main/java/org/elasticsearch/lucene/queries/BlendedTermQuery.java
@@ -148,7 +148,10 @@ protected int compare(int i, int j) {
             if (prev > current) {
                 actualDf++;
             }
-            contexts[i] = ctx = adjustDF(reader.getContext(), ctx, Math.min(maxDoc, actualDf));
+
+            int docCount = reader.getDocCount(terms[i].field());
+            int newDocFreq = Math.min(actualDf, docCount);
+            contexts[i] = ctx = adjustDF(reader.getContext(), ctx, newDocFreq);
             prev = current;
             sumTTF += ctx.totalTermFreq();
         }

diff --git a/server/src/test/java/org/elasticsearch/lucene/queries/BlendedTermQueryTests.java b/server/src/test/java/org/elasticsearch/lucene/queries/BlendedTermQueryTests.java
@@ -248,6 +248,39 @@ public void testMinTTF() throws IOException {
         dir.close();
     }
 
+    public void testMissingFields() throws IOException {
+        Directory dir = newDirectory();
+        IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
+        FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
+        ft.freeze();
+
+        for (int i = 0; i < 10; i++) {
+            Document d = new Document();
+            d.add(new TextField("id", Integer.toString(i), Field.Store.YES));
+            d.add(new Field("dense", "foo", ft));
+            // Add a sparse field with high totalTermFreq but low docCount
+            if (i % 5 == 0) {
+                d.add(new Field("sparse", "foo", ft));
+                d.add(new Field("sparse", "one two three four five size", ft));
+            }
+            w.addDocument(d);
+        }
+        w.commit();
+
+        DirectoryReader reader = DirectoryReader.open(w);
+        IndexSearcher searcher = setSimilarity(newSearcher(reader));
+
+        String[] fields = new String[] { "dense", "sparse" };
+        Query query = BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "foo"), 0.1f);
+        TopDocs search = searcher.search(query, 10);
+        ScoreDoc[] scoreDocs = search.scoreDocs;
+        assertEquals(Integer.toString(0), reader.document(scoreDocs[0].doc).getField("id").stringValue());
+
+        reader.close();
+        w.close();
+        dir.close();
+    }
+
     public void testEqualsAndHash() {
         String[] fields = new String[1 + random().nextInt(10)];
         for (int i = 0; i < fields.length; i++) {