Skip to content

Commit

Permalink
SOLR-3240: add "spellcheck.collateMaxCollectDocs" for estimating coll…
Browse files Browse the repository at this point in the history
…ation hit-counts.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1479638 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
jdyer1 committed May 6, 2013
1 parent a57bc58 commit 7141cb3
Show file tree
Hide file tree
Showing 8 changed files with 345 additions and 26 deletions.
6 changes: 6 additions & 0 deletions solr/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,12 @@ New Features
* SOLR-4761: Add option to plugin a merged segment warmer into solrconfig.xml
(Mark Miller, Mike McCandless, Robert Muir)

* SOLR-3240: Add "spellcheck.collateMaxCollectDocs" option so that when testing
potential Collations against the index, SpellCheckComponent will only collect
n documents, thereby estimating the hit-count. This is a performance optimization
in cases where exact hit-counts are unnecessary. Also, when "collateExtendedResults"
is false, this optimization is always made (James Dyer).

Bug Fixes
----------------------

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,10 +214,20 @@ protected void addCollationsToResponse(SolrParams params, SpellingResult spellin
int maxCollationTries = params.getInt(SPELLCHECK_MAX_COLLATION_TRIES, 0);
int maxCollationEvaluations = params.getInt(SPELLCHECK_MAX_COLLATION_EVALUATIONS, 10000);
boolean collationExtendedResults = params.getBool(SPELLCHECK_COLLATE_EXTENDED_RESULTS, false);
int maxCollationCollectDocs = params.getInt(SPELLCHECK_COLLATE_MAX_COLLECT_DOCS, 0);
// If not reporting hits counts, don't bother collecting more than 1 document per try.
if (!collationExtendedResults) {
maxCollationCollectDocs = 1;
}
boolean shard = params.getBool(ShardParams.IS_SHARD, false);

SpellCheckCollator collator = new SpellCheckCollator();
List<SpellCheckCollation> collations = collator.collate(spellingResult, q, rb, maxCollations, maxCollationTries, maxCollationEvaluations, suggestionsMayOverlap);
SpellCheckCollator collator = new SpellCheckCollator()
.setMaxCollations(maxCollations)
.setMaxCollationTries(maxCollationTries)
.setMaxCollationEvaluations(maxCollationEvaluations)
.setSuggestionsMayOverlap(suggestionsMayOverlap)
.setDocCollectionLimit(maxCollationCollectDocs)
.setReportHits(collationExtendedResults);
List<SpellCheckCollation> collations = collator.collate(spellingResult, q, rb);
//by sorting here we guarantee a non-distributed request returns all
//results in the same order as a distributed request would,
//even in cases when the internal rank is the same.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
package org.apache.solr.search;

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import java.io.IOException;

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Scorer;
/**
 * <p>
 * A wrapper {@link Collector} that throws {@link EarlyTerminatingCollectorException}
 * once a specified maximum number of documents has been collected.
 * </p>
 * <p>
 * Every call is forwarded to the wrapped collector; this class only counts the
 * documents passing through and remembers the id of the most recent one.
 * </p>
 */
public class EarlyTerminatingCollector extends Collector {
  private final Collector delegate;
  private final int maxDocsToCollect;
  private int numCollected;
  private int lastDocId = -1;
  // Doc base of the reader (segment) currently being collected. collect() is
  // handed reader-relative ids, so this offset is needed to report ids that
  // are comparable with the top-level reader's maxDoc().
  private int docBase;

  /**
   * <p>
   * Wraps a {@link Collector}, throwing {@link EarlyTerminatingCollectorException}
   * once the specified maximum is reached.
   * </p>
   * @param delegate - the Collector to wrap.
   * @param maxDocsToCollect - the maximum number of documents to Collect.
   *        A value of zero or less never triggers early termination.
   */
  public EarlyTerminatingCollector(Collector delegate, int maxDocsToCollect) {
    this.delegate = delegate;
    this.maxDocsToCollect = maxDocsToCollect;
  }

  @Override
  public boolean acceptsDocsOutOfOrder() {
    return delegate.acceptsDocsOutOfOrder();
  }

  /**
   * Forwards the document to the wrapped collector, then aborts collection if
   * the configured maximum has been reached.
   *
   * @param doc the document id, relative to the current reader
   * @throws EarlyTerminatingCollectorException once the maximum number of
   *         documents has been collected; it carries the number collected and
   *         the index-wide id of the last collected document
   */
  @Override
  public void collect(int doc) throws IOException {
    delegate.collect(doc);
    // Translate the reader-relative id into an index-wide id.
    lastDocId = docBase + doc;
    numCollected++;
    // ">=" rather than "==" so that a counter moved past the limit through
    // setNumCollected() still terminates; the "> 0" guard keeps a non-positive
    // limit meaning "never terminate".
    if (maxDocsToCollect > 0 && numCollected >= maxDocsToCollect) {
      throw new EarlyTerminatingCollectorException(numCollected, lastDocId);
    }
  }

  @Override
  public void setNextReader(AtomicReaderContext context) throws IOException {
    delegate.setNextReader(context);
    docBase = context.docBase;
  }

  @Override
  public void setScorer(Scorer scorer) throws IOException {
    delegate.setScorer(scorer);
  }

  /** @return the number of documents collected so far */
  public int getNumCollected() {
    return numCollected;
  }

  /** @param numCollected overrides the running count of collected documents */
  public void setNumCollected(int numCollected) {
    this.numCollected = numCollected;
  }

  /** @return the index-wide id of the last collected document, or -1 if none yet */
  public int getLastDocId() {
    return lastDocId;
  }

  /** @param lastDocId overrides the remembered id of the last collected document */
  public void setLastDocId(int lastDocId) {
    this.lastDocId = lastDocId;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package org.apache.solr.search;

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Thrown by {@link EarlyTerminatingCollector} to abort the scoring / collection
 * process early, once the specified maximum number of documents has been
 * collected.
 */
public class EarlyTerminatingCollectorException extends RuntimeException {
  private static final long serialVersionUID = 5939241340763428118L;
  private int lastDocId = -1;
  private int numberCollected;

  /**
   * @param numberCollected the number of documents collected before aborting
   * @param lastDocId the id of the last document that was collected
   */
  public EarlyTerminatingCollectorException(int numberCollected, int lastDocId) {
    this.numberCollected = numberCollected;
    this.lastDocId = lastDocId;
  }

  /**
   * Describes the termination using the current field values. It is computed
   * on demand so that it stays accurate after the setters are used.
   *
   * @return a message naming the number collected and the last document id
   */
  @Override
  public String getMessage() {
    return "Collection terminated early: numberCollected=" + numberCollected
        + ", lastDocId=" + lastDocId;
  }

  /** @return the id of the last document collected before termination */
  public int getLastDocId() {
    return lastDocId;
  }

  /** @param lastDocId the id of the last document collected before termination */
  public void setLastDocId(int lastDocId) {
    this.lastDocId = lastDocId;
  }

  /** @return the number of documents collected before termination */
  public int getNumberCollected() {
    return numberCollected;
  }

  /** @param numberCollected the number of documents collected before termination */
  public void setNumberCollected(int numberCollected) {
    this.numberCollected = numberCollected;
  }
}
23 changes: 18 additions & 5 deletions solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.spelling.QueryConverter;
import org.apache.solr.update.SolrIndexConfig;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -1235,7 +1236,8 @@ public DocList getDocList(Query query, List<Query> filterList, Sort lsort, int o
public static final int GET_DOCSET = 0x40000000;
static final int NO_CHECK_FILTERCACHE = 0x20000000;
static final int NO_SET_QCACHE = 0x10000000;

public static final int TERMINATE_EARLY = 0x04;
public static final int FORCE_INORDER_COLLECTION = 0x08;
public static final int GET_DOCLIST = 0x02; // get the documents actually returned in a response
public static final int GET_SCORES = 0x01;

Expand Down Expand Up @@ -1394,7 +1396,8 @@ private void getDocListNC(QueryResult qr,QueryCommand cmd) throws IOException {
float[] scores;

boolean needScores = (cmd.getFlags() & GET_SCORES) != 0;

boolean terminateEarly = (cmd.getFlags() & TERMINATE_EARLY) == TERMINATE_EARLY;

Query query = QueryUtils.makeQueryable(cmd.getQuery());

ProcessedFilter pf = getProcessedFilter(cmd.getFilter(), cmd.getFilterList());
Expand Down Expand Up @@ -1446,7 +1449,9 @@ public boolean acceptsDocsOutOfOrder() {
}
};
}

if (terminateEarly) {
collector = new EarlyTerminatingCollector(collector, cmd.len);
}
if( timeAllowed > 0 ) {
collector = new TimeLimitingCollector(collector, TimeLimitingCollector.getGlobalCounter(), timeAllowed);
}
Expand Down Expand Up @@ -1481,6 +1486,9 @@ public boolean acceptsDocsOutOfOrder() {
topCollector = TopFieldCollector.create(weightSort(cmd.getSort()), len, false, needScores, needScores, true);
}
Collector collector = topCollector;
if (terminateEarly) {
collector = new EarlyTerminatingCollector(collector, cmd.len);
}
if( timeAllowed > 0 ) {
collector = new TimeLimitingCollector(collector, TimeLimitingCollector.getGlobalCounter(), timeAllowed);
}
Expand Down Expand Up @@ -1529,6 +1537,7 @@ private DocSet getDocListAndSetNC(QueryResult qr,QueryCommand cmd) throws IOExce
DocSet set;

boolean needScores = (cmd.getFlags() & GET_SCORES) != 0;
boolean terminateEarly = (cmd.getFlags() & TERMINATE_EARLY) == TERMINATE_EARLY;
int maxDoc = maxDoc();
int smallSetSize = maxDoc>>6;

Expand Down Expand Up @@ -1568,7 +1577,9 @@ public boolean acceptsDocsOutOfOrder() {
}
});
}

if (terminateEarly) {
collector = new EarlyTerminatingCollector(collector, cmd.len);
}
if( timeAllowed > 0 ) {
collector = new TimeLimitingCollector(collector, TimeLimitingCollector.getGlobalCounter(), timeAllowed);
}
Expand Down Expand Up @@ -1604,7 +1615,9 @@ public boolean acceptsDocsOutOfOrder() {

DocSetCollector setCollector = new DocSetDelegateCollector(maxDoc>>6, maxDoc, topCollector);
Collector collector = setCollector;

if (terminateEarly) {
collector = new EarlyTerminatingCollector(collector, cmd.len);
}
if( timeAllowed > 0 ) {
collector = new TimeLimitingCollector(collector, TimeLimitingCollector.getGlobalCounter(), timeAllowed );
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@
import java.util.List;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.index.IndexReader;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.DisMaxParams;
import org.apache.solr.common.params.GroupParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
Expand All @@ -33,15 +35,23 @@
import org.apache.solr.handler.component.SearchComponent;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.search.EarlyTerminatingCollectorException;
import org.apache.solr.search.SolrIndexSearcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class SpellCheckCollator {
private static final Logger LOG = LoggerFactory.getLogger(SpellCheckCollator.class);
private int maxCollations = 1;
private int maxCollationTries = 0;
private int maxCollationEvaluations = 10000;
private boolean suggestionsMayOverlap = false;
private int docCollectionLimit = 0;
private boolean reportHits = true;

public List<SpellCheckCollation> collate(SpellingResult result, String originalQuery, ResponseBuilder ultimateResponse,
int maxCollations, int maxTries, int maxEvaluations, boolean suggestionsMayOverlap) {
List<SpellCheckCollation> collations = new ArrayList<SpellCheckCollation>();
public List<SpellCheckCollation> collate(SpellingResult result,
String originalQuery, ResponseBuilder ultimateResponse) {
List<SpellCheckCollation> collations = new ArrayList<SpellCheckCollation>();

QueryComponent queryComponent = null;
if (ultimateResponse.components != null) {
Expand All @@ -54,6 +64,7 @@ public List<SpellCheckCollation> collate(SpellingResult result, String originalQ
}

boolean verifyCandidateWithQuery = true;
int maxTries = maxCollationTries;
int maxNumberToIterate = maxTries;
if (maxTries < 1) {
maxTries = 1;
Expand All @@ -65,10 +76,17 @@ public List<SpellCheckCollation> collate(SpellingResult result, String originalQ
maxTries = 1;
verifyCandidateWithQuery = false;
}
docCollectionLimit = docCollectionLimit > 0 ? docCollectionLimit : 0;
int maxDocId = -1;
if (verifyCandidateWithQuery && docCollectionLimit > 0) {
IndexReader reader = ultimateResponse.req.getSearcher().getIndexReader();
maxDocId = reader.maxDoc();
}

int tryNo = 0;
int collNo = 0;
PossibilityIterator possibilityIter = new PossibilityIterator(result.getSuggestions(), maxNumberToIterate, maxEvaluations, suggestionsMayOverlap);
PossibilityIterator possibilityIter = new PossibilityIterator(result.getSuggestions(),
maxNumberToIterate, maxCollationEvaluations, suggestionsMayOverlap);
while (tryNo < maxTries && collNo < maxCollations && possibilityIter.hasNext()) {

PossibilityIterator.RankedSpellPossibility possibility = possibilityIter.next();
Expand Down Expand Up @@ -96,21 +114,49 @@ public List<SpellCheckCollation> collate(SpellingResult result, String originalQ
}
params.set(CommonParams.Q, collationQueryStr);
params.remove(CommonParams.START);
params.set(CommonParams.ROWS, "" + docCollectionLimit);
// we don't want any stored fields
params.set(CommonParams.FL, "id");
params.set(CommonParams.ROWS, "0");
// we'll sort by doc id to ensure no scoring is done.
params.set(CommonParams.SORT, "_docid_ asc");
// If a dismax query, don't add unnecessary clauses for scoring
params.remove(DisMaxParams.TIE);
params.remove(DisMaxParams.PF);
params.remove(DisMaxParams.PF2);
params.remove(DisMaxParams.PF3);
params.remove(DisMaxParams.BQ);
params.remove(DisMaxParams.BF);
// Collate testing does not support Grouping (see SOLR-2577)
params.remove(GroupParams.GROUP);

// creating a request here... make sure to close it!
ResponseBuilder checkResponse = new ResponseBuilder(new LocalSolrQueryRequest(ultimateResponse.req.getCore(), params),new SolrQueryResponse(), Arrays.<SearchComponent>asList(queryComponent));
ResponseBuilder checkResponse = new ResponseBuilder(
new LocalSolrQueryRequest(ultimateResponse.req.getCore(), params),
new SolrQueryResponse(), Arrays.<SearchComponent> asList(queryComponent));
checkResponse.setQparser(ultimateResponse.getQparser());
checkResponse.setFilters(ultimateResponse.getFilters());
checkResponse.setQueryString(collationQueryStr);
checkResponse.components = Arrays.<SearchComponent>asList(queryComponent);

try {
queryComponent.prepare(checkResponse);
if (docCollectionLimit > 0) {
int f = checkResponse.getFieldFlags();
checkResponse.setFieldFlags(f |= SolrIndexSearcher.TERMINATE_EARLY);
if (reportHits) {
f = checkResponse.getFieldFlags();
checkResponse.setFieldFlags(f |= SolrIndexSearcher.FORCE_INORDER_COLLECTION);
}
}
queryComponent.process(checkResponse);
hits = (Integer) checkResponse.rsp.getToLog().get("hits");
} catch (EarlyTerminatingCollectorException etce) {
assert (docCollectionLimit > 0);
if (etce.getLastDocId() + 1 == maxDocId) {
hits = docCollectionLimit;
} else {
hits = maxDocId / ((etce.getLastDocId() + 1) / docCollectionLimit);
}
} catch (Exception e) {
LOG.warn("Exception trying to re-query to check if a spell check possibility would return any hits.", e);
} finally {
Expand Down Expand Up @@ -191,6 +237,31 @@ private String getCollation(String origQuery,
offset += corr.length() - oneForReqOrProhib - (tok.endOffset() - tok.startOffset());
}
return collation.toString();
}
/**
 * Sets the cap on collations: the collate loop keeps iterating only while its
 * collation counter is below this value. The field defaults to 1.
 *
 * @param maxCollations the new cap
 * @return this collator, so calls can be chained
 */
public SpellCheckCollator setMaxCollations(int maxCollations) {
this.maxCollations = maxCollations;
return this;
}
/**
 * Sets the number of tries the collate loop may make. collate() raises a value
 * below 1 to a single try. The field defaults to 0.
 *
 * @param maxCollationTries the new number of tries
 * @return this collator, so calls can be chained
 */
public SpellCheckCollator setMaxCollationTries(int maxCollationTries) {
this.maxCollationTries = maxCollationTries;
return this;
}
/**
 * Sets the evaluation limit that collate() hands to the PossibilityIterator
 * it builds over the suggestions. The field defaults to 10000.
 *
 * @param maxCollationEvaluations the new evaluation limit
 * @return this collator, so calls can be chained
 */
public SpellCheckCollator setMaxCollationEvaluations(
int maxCollationEvaluations) {
this.maxCollationEvaluations = maxCollationEvaluations;
return this;
}
/**
 * Sets the overlap flag that collate() hands to the PossibilityIterator it
 * builds over the suggestions. The field defaults to false.
 *
 * @param suggestionsMayOverlap the new flag value
 * @return this collator, so calls can be chained
 */
public SpellCheckCollator setSuggestionsMayOverlap(
boolean suggestionsMayOverlap) {
this.suggestionsMayOverlap = suggestionsMayOverlap;
return this;
}
/**
 * Sets how many documents a collation test query may collect. When positive,
 * collate() adds SolrIndexSearcher.TERMINATE_EARLY to the test query's flags
 * and, on an EarlyTerminatingCollectorException, estimates the hit count
 * instead of reading it from the response. Values of zero or less are
 * normalized to 0, which skips that handling. The field defaults to 0.
 *
 * @param docCollectionLimit the new collection limit
 * @return this collator, so calls can be chained
 */
public SpellCheckCollator setDocCollectionLimit(int docCollectionLimit) {
this.docCollectionLimit = docCollectionLimit;
return this;
}
/**
 * Sets whether hit counts are to be reported. In collate() this is consulted
 * only when the doc collection limit is positive: if true, the test query is
 * additionally flagged with SolrIndexSearcher.FORCE_INORDER_COLLECTION. The
 * field defaults to true.
 *
 * @param reportHits the new flag value
 * @return this collator, so calls can be chained
 */
public SpellCheckCollator setReportHits(boolean reportHits) {
this.reportHits = reportHits;
return this;
}

}
Loading

0 comments on commit 7141cb3

Please sign in to comment.