Fix completion suggester's score tie-break (#34508)
The shard suggestion sort uses a different tie-break than the one used to merge the shard responses. The former uses the internal document identifier when scores are identical, whereas the latter compares the surface form first. Because of this discrepancy, some suggestion outputs are linked to the wrong documents: the merge sort reorders the shard suggestions differently. This change fixes the bug by duplicating the Lucene collector in order to apply the same tie-break strategy as the merge sort. This logic will be removed when https://issues.apache.org/jira/browse/LUCENE-8529 is fixed.

Closes #34378
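For context, the ordering the merge (reduce) phase expects is a three-level comparison: score descending, then the surface form (completion key), then the smaller document id. The comparator below is only an illustrative sketch of that ordering, written against Lucene's SuggestScoreDoc and Lookup.CHARSEQUENCE_COMPARATOR; it mirrors the sort added in the collector further down rather than the actual merge code.

import java.util.Comparator;

import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.document.TopSuggestDocs.SuggestScoreDoc;

final class SuggestTieBreakSketch {
    // Higher score first; ties broken by surface form, then by the smaller doc id.
    static final Comparator<SuggestScoreDoc> MERGE_ORDER = (a, b) -> {
        int cmp = Float.compare(b.score, a.score);                      // score, descending
        if (cmp == 0) {
            cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key); // surface form
            if (cmp == 0) {
                cmp = Integer.compare(a.doc, b.doc);                    // smaller doc id wins
            }
        }
        return cmp;
    };
}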
Showing 3 changed files with 275 additions and 124 deletions.
232 changes: 232 additions & 0 deletions
...c/main/java/org/elasticsearch/search/suggest/completion/TopSuggestGroupDocsCollector.java
@@ -0,0 +1,232 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.elasticsearch.search.suggest.completion;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.CollectionTerminatedException;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.document.TopSuggestDocs;
import org.apache.lucene.search.suggest.document.TopSuggestDocsCollector;
import org.apache.lucene.util.PriorityQueue;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 *
 * Custom {@link TopSuggestDocsCollector} that returns top documents from the completion suggester.
 * <p>
 * TODO: this should be refactored when https://issues.apache.org/jira/browse/LUCENE-8529 is fixed.
 * Unlike the parent class, this collector uses the surface form to tie-break suggestions with identical
 * scores.
 * <p>
 * This collector groups suggestions coming from the same document but matching different contexts
 * or surface form together. When different contexts or surface forms match the same suggestion form only
 * the best one per document (sorted by weight) is kept.
 * <p>
 * This collector is also able to filter duplicate suggestion coming from different documents.
 * In order to keep this feature fast, the de-duplication of suggestions with different contexts is done
 * only on the top N*num_contexts (where N is the number of documents to return) suggestions per segment.
 * This means that skip_duplicates will visit at most N*num_contexts suggestions per segment to find unique suggestions
 * that match the input. If more than N*num_contexts suggestions are duplicated with different contexts this collector
 * will not be able to return more than one suggestion even when N is greater than 1.
 **/
class TopSuggestGroupDocsCollector extends TopSuggestDocsCollector {
    private final class SuggestScoreDocPriorityQueue extends PriorityQueue<TopSuggestDocs.SuggestScoreDoc> {
        /**
         * Creates a new priority queue of the specified size.
         */
        private SuggestScoreDocPriorityQueue(int size) {
            super(size);
        }

        @Override
        protected boolean lessThan(TopSuggestDocs.SuggestScoreDoc a, TopSuggestDocs.SuggestScoreDoc b) {
            if (a.score == b.score) {
                // tie break by completion key
                int cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);
                // prefer smaller doc id, in case of a tie
                return cmp != 0 ? cmp > 0 : a.doc > b.doc;
            }
            return a.score < b.score;
        }

        /**
         * Returns the top N results in descending order.
         */
        public TopSuggestDocs.SuggestScoreDoc[] getResults() {
            int size = size();
            TopSuggestDocs.SuggestScoreDoc[] res = new TopSuggestDocs.SuggestScoreDoc[size];
            for (int i = size - 1; i >= 0; i--) {
                res[i] = pop();
            }
            return res;
        }
    }


    private final SuggestScoreDocPriorityQueue priorityQueue;
    private final int num;

    /** Only set if we are deduplicating hits: holds all per-segment hits until the end, when we dedup them */
    private final List<TopSuggestDocs.SuggestScoreDoc> pendingResults;

    /** Only set if we are deduplicating hits: holds all surface forms seen so far in the current segment */
    final CharArraySet seenSurfaceForms;

    /** Document base offset for the current Leaf */
    protected int docBase;

    private Map<Integer, List<CharSequence>> docContexts = new HashMap<>();

    /**
     * Sole constructor
     *
     * Collects at most <code>num</code> completions
     * with corresponding document and weight
     */
    TopSuggestGroupDocsCollector(int num, boolean skipDuplicates) {
        super(1, skipDuplicates);
        if (num <= 0) {
            throw new IllegalArgumentException("'num' must be > 0");
        }
        this.num = num;
        this.priorityQueue = new SuggestScoreDocPriorityQueue(num);
        if (skipDuplicates) {
            seenSurfaceForms = new CharArraySet(num, false);
            pendingResults = new ArrayList<>();
        } else {
            seenSurfaceForms = null;
            pendingResults = null;
        }
    }

    /**
     * Returns the contexts associated with the provided <code>doc</code>.
     */
    public List<CharSequence> getContexts(int doc) {
        return docContexts.getOrDefault(doc, Collections.emptyList());
    }

    @Override
    protected boolean doSkipDuplicates() {
        return seenSurfaceForms != null;
    }

    @Override
    public int getCountToCollect() {
        return num;
    }

    @Override
    protected void doSetNextReader(LeafReaderContext context) throws IOException {
        docBase = context.docBase;
        if (seenSurfaceForms != null) {
            seenSurfaceForms.clear();
            // NOTE: this also clears the priorityQueue:
            for (TopSuggestDocs.SuggestScoreDoc hit : priorityQueue.getResults()) {
                pendingResults.add(hit);
            }
        }
    }

    @Override
    public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException {
        int globalDoc = docID + docBase;
        boolean isDuplicate = docContexts.containsKey(globalDoc);
        List<CharSequence> contexts = docContexts.computeIfAbsent(globalDoc, k -> new ArrayList<>());
        if (context != null) {
            contexts.add(context);
        }
        if (isDuplicate) {
            return;
        }
        TopSuggestDocs.SuggestScoreDoc current = new TopSuggestDocs.SuggestScoreDoc(globalDoc, key, context, score);
        if (current == priorityQueue.insertWithOverflow(current)) {
            // if the current SuggestScoreDoc has overflown from pq,
            // we can assume all of the successive collections from
            // this leaf will be overflown as well
            // TODO: reuse the overflow instance?
            throw new CollectionTerminatedException();
        }
    }

    @Override
    public TopSuggestDocs get() throws IOException {

        TopSuggestDocs.SuggestScoreDoc[] suggestScoreDocs;

        if (seenSurfaceForms != null) {
            // NOTE: this also clears the priorityQueue:
            for (TopSuggestDocs.SuggestScoreDoc hit : priorityQueue.getResults()) {
                pendingResults.add(hit);
            }

            // Deduplicate all hits: we already dedup'd efficiently within each segment by
            // truncating the FST top paths search, but across segments there may still be dups:
            seenSurfaceForms.clear();

            // TODO: we could use a priority queue here to make cost O(N * log(num)) instead of O(N * log(N)), where N = O(num *
            // numSegments), but typically numSegments is smallish and num is smallish so this won't matter much in practice:

            Collections.sort(pendingResults,
                (a, b) -> {
                    // sort by higher score
                    int cmp = Float.compare(b.score, a.score);
                    if (cmp == 0) {
                        // tie break by completion key
                        cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);
                        if (cmp == 0) {
                            // prefer smaller doc id, in case of a tie
                            cmp = Integer.compare(a.doc, b.doc);
                        }
                    }
                    return cmp;
                });

            List<TopSuggestDocs.SuggestScoreDoc> hits = new ArrayList<>();

            for (TopSuggestDocs.SuggestScoreDoc hit : pendingResults) {
                if (seenSurfaceForms.contains(hit.key) == false) {
                    seenSurfaceForms.add(hit.key);
                    hits.add(hit);
                    if (hits.size() == num) {
                        break;
                    }
                }
            }
            suggestScoreDocs = hits.toArray(new TopSuggestDocs.SuggestScoreDoc[0]);
        } else {
            suggestScoreDocs = priorityQueue.getResults();
        }

        if (suggestScoreDocs.length > 0) {
            return new TopSuggestDocs(new TotalHits(suggestScoreDocs.length, TotalHits.Relation.EQUAL_TO), suggestScoreDocs);
        } else {
            return TopSuggestDocs.EMPTY;
        }
    }

}
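For readers unfamiliar with how such a collector is driven, a minimal usage sketch follows. The field name "suggest", the prefix "foo", the size 5, and the wrapper class are hypothetical; the collector is package-private, so the sketch lives in the same org.elasticsearch.search.suggest.completion package and uses only Lucene's SuggestIndexSearcher and PrefixCompletionQuery, not the Elasticsearch call site.

package org.elasticsearch.search.suggest.completion;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.suggest.document.PrefixCompletionQuery;
import org.apache.lucene.search.suggest.document.SuggestIndexSearcher;
import org.apache.lucene.search.suggest.document.TopSuggestDocs;
import org.apache.lucene.store.Directory;

import java.io.IOException;

final class TopSuggestGroupDocsCollectorUsageSketch {
    // 'dir' must hold an index whose "suggest" field was written with the completion postings format.
    static void printTopSuggestions(Directory dir) throws IOException {
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            SuggestIndexSearcher searcher = new SuggestIndexSearcher(reader);
            PrefixCompletionQuery query =
                new PrefixCompletionQuery(new StandardAnalyzer(), new Term("suggest", "foo"));
            // Collect the top 5 suggestions, skipping duplicate surface forms.
            TopSuggestGroupDocsCollector collector = new TopSuggestGroupDocsCollector(5, true);
            searcher.suggest(query, collector);
            TopSuggestDocs top = collector.get();
            for (TopSuggestDocs.SuggestScoreDoc hit : top.scoreLookupDocs()) {
                System.out.println(hit.key + " (doc=" + hit.doc + ", score=" + hit.score + ")");
                // Contexts grouped per document by the collector:
                System.out.println("  contexts: " + collector.getContexts(hit.doc));
            }
        }
    }
}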