From 59f96a20e698da482c0116dea6604440dd8000b7 Mon Sep 17 00:00:00 2001 From: Julie Tibshirani Date: Tue, 6 Sep 2022 13:02:24 -0700 Subject: [PATCH] Avoid negative scores with cross_fields type (#89016) The cross_fields scoring type can produce negative scores when some documents are missing fields. When blending term document frequencies, we take the maximum document frequency across all fields. If one field appears in fewer documents than another, this means that its IDF can become negative. This is because IDF is calculated as `Math.log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5))` This change adjusts the docFreq for each field to `Math.min(docCount, docFreq)` so that the IDF can never become negative. It makes sense that the term document frequency should never exceed the number of documents containing the field. --- docs/changelog/89016.yaml | 6 ++++ .../query-dsl/multi-match-query.asciidoc | 11 ++++--- .../lucene/queries/BlendedTermQuery.java | 5 ++- .../lucene/queries/BlendedTermQueryTests.java | 33 +++++++++++++++++++ 4 files changed, 49 insertions(+), 6 deletions(-) create mode 100644 docs/changelog/89016.yaml diff --git a/docs/changelog/89016.yaml b/docs/changelog/89016.yaml new file mode 100644 index 0000000000000..98f15e8fe1d7d --- /dev/null +++ b/docs/changelog/89016.yaml @@ -0,0 +1,6 @@ +pr: 89016 +summary: Avoid negative scores with `cross_fields` type +area: Ranking +type: bug +issues: + - 44700 diff --git a/docs/reference/query-dsl/multi-match-query.asciidoc b/docs/reference/query-dsl/multi-match-query.asciidoc index b27cdb4c548b4..a7db337004dcd 100644 --- a/docs/reference/query-dsl/multi-match-query.asciidoc +++ b/docs/reference/query-dsl/multi-match-query.asciidoc @@ -388,11 +388,12 @@ explanation: Also, accepts `analyzer`, `boost`, `operator`, `minimum_should_match`, `lenient` and `zero_terms_query`. -WARNING: The `cross_fields` type blends field statistics in a way that does -not always produce well-formed scores (for example scores can become -negative). As an alternative, you can consider the -<> query, which is also -term-centric but combines field statistics in a more robust way. +WARNING: The `cross_fields` type blends field statistics in a complex way that +can be hard to interpret. The score combination can even be incorrect, in +particular when some documents contain some of the search fields, but not all +of them. You should consider the +<> query as an alternative, +which is also term-centric but combines field statistics in a more robust way. [[cross-field-analysis]] ===== `cross_field` and analysis diff --git a/server/src/main/java/org/elasticsearch/lucene/queries/BlendedTermQuery.java b/server/src/main/java/org/elasticsearch/lucene/queries/BlendedTermQuery.java index a57feef2f9b23..0c08f58909a01 100644 --- a/server/src/main/java/org/elasticsearch/lucene/queries/BlendedTermQuery.java +++ b/server/src/main/java/org/elasticsearch/lucene/queries/BlendedTermQuery.java @@ -148,7 +148,10 @@ protected int compare(int i, int j) { if (prev > current) { actualDf++; } - contexts[i] = ctx = adjustDF(reader.getContext(), ctx, Math.min(maxDoc, actualDf)); + + int docCount = reader.getDocCount(terms[i].field()); + int newDocFreq = Math.min(actualDf, docCount); + contexts[i] = ctx = adjustDF(reader.getContext(), ctx, newDocFreq); prev = current; sumTTF += ctx.totalTermFreq(); } diff --git a/server/src/test/java/org/elasticsearch/lucene/queries/BlendedTermQueryTests.java b/server/src/test/java/org/elasticsearch/lucene/queries/BlendedTermQueryTests.java index 3188aba723a7d..4be9b04b5bce6 100644 --- a/server/src/test/java/org/elasticsearch/lucene/queries/BlendedTermQueryTests.java +++ b/server/src/test/java/org/elasticsearch/lucene/queries/BlendedTermQueryTests.java @@ -248,6 +248,39 @@ public void testMinTTF() throws IOException { dir.close(); } + public void testMissingFields() throws IOException { + Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random()))); + FieldType ft = new FieldType(TextField.TYPE_NOT_STORED); + ft.freeze(); + + for (int i = 0; i < 10; i++) { + Document d = new Document(); + d.add(new TextField("id", Integer.toString(i), Field.Store.YES)); + d.add(new Field("dense", "foo", ft)); + // Add a sparse field with high totalTermFreq but low docCount + if (i % 5 == 0) { + d.add(new Field("sparse", "foo", ft)); + d.add(new Field("sparse", "one two three four five size", ft)); + } + w.addDocument(d); + } + w.commit(); + + DirectoryReader reader = DirectoryReader.open(w); + IndexSearcher searcher = setSimilarity(newSearcher(reader)); + + String[] fields = new String[] { "dense", "sparse" }; + Query query = BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "foo"), 0.1f); + TopDocs search = searcher.search(query, 10); + ScoreDoc[] scoreDocs = search.scoreDocs; + assertEquals(Integer.toString(0), reader.document(scoreDocs[0].doc).getField("id").stringValue()); + + reader.close(); + w.close(); + dir.close(); + } + public void testEqualsAndHash() { String[] fields = new String[1 + random().nextInt(10)]; for (int i = 0; i < fields.length; i++) {