Add range query support to wildcard field #57881

Merged · 2 commits · Jun 11, 2020
WildcardFieldMapper.java
@@ -30,15 +30,19 @@
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.automaton.RegExp.Kind;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.geo.ShapeRelation;
import org.elasticsearch.common.lucene.BytesRefs;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.time.DateMathParser;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
@@ -70,6 +74,7 @@

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.time.ZoneId;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
@@ -613,6 +618,12 @@ static Query simplify(Query input) {
    static boolean isMatchAll(Query q) {
        return q instanceof MatchAllDocsQuery || q instanceof MatchAllButRequireVerificationQuery;
    }

    protected String firstNgramToken(String fragment) {
        LinkedHashSet<String> tokens = new LinkedHashSet<>();
        getNgramTokens(tokens, fragment);
        return tokens.iterator().next();
    }

    protected void getNgramTokens(Set<String> tokens, String fragment) {
        if (fragment.equals(TOKEN_START_STRING) || fragment.equals(TOKEN_END_STRING)) {
@@ -678,6 +689,90 @@ private void addClause(String token, BooleanQuery.Builder bqBuilder, Occur occur
        }
    }

    @Override
    public Query rangeQuery(
        Object lowerTerm,
        Object upperTerm,
        boolean includeLower,
        boolean includeUpper,
        ShapeRelation relation,
        ZoneId timeZone,
        DateMathParser parser,
        QueryShardContext context
    ) {
        if (context.allowExpensiveQueries() == false) {
            throw new ElasticsearchException("[range] queries on [wildcard] fields cannot be executed when '" +
                ALLOW_EXPENSIVE_QUERIES.getKey() + "' is set to false.");
        }
        BytesRef lower = lowerTerm == null ? null : BytesRefs.toBytesRef(lowerTerm);
        BytesRef upper = upperTerm == null ? null : BytesRefs.toBytesRef(upperTerm);
        Query accelerationQuery = null;
        if (lowerTerm != null && upperTerm != null) {
            // Long common prefixes e.g. "C:/Program Files/a.txt" to "C:/Program Files/z.txt"
            // can be accelerated by searching for all the common leading ngrams e.g. c:/, /pr, rog, gra etc.
            // (a standalone sketch of both acceleration strategies follows this method)
            StringBuilder commonPrefix = new StringBuilder();
            String lowerS = addLineEndChars(toLowerCase(lower.utf8ToString()));
            String upperS = addLineEndChars(toLowerCase(upper.utf8ToString()));
            for (int i = 0; i < Math.min(lowerS.length(), upperS.length());) {
                final int cL = lowerS.codePointAt(i);
                final int cU = upperS.codePointAt(i);
                if (cL == cU) {
                    commonPrefix.append(Character.toChars(cL));
                } else {
                    break;
                }
                int length = Character.charCount(cL);
                i += length;
            }

            if (commonPrefix.length() > 0) {
                Set<String> tokens = new HashSet<>();
                getNgramTokens(tokens, commonPrefix.toString());
                BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
                for (String token : tokens) {
                    int tokenSize = token.codePointCount(0, token.length());
                    if (tokenSize < 2 || token.equals(WildcardFieldMapper.TOKEN_END_STRING)) {
                        continue;
                    }

                    if (tokenSize == NGRAM_SIZE) {
                        TermQuery tq = new TermQuery(new Term(name(), token));
                        bqBuilder.add(new BooleanClause(tq, Occur.MUST));
                    } else {
                        PrefixQuery wq = new PrefixQuery(new Term(name(), token));
                        wq.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
                        bqBuilder.add(new BooleanClause(wq, Occur.MUST));
                    }
                }
                BooleanQuery bq = bqBuilder.build();
                if (bq.clauses().size() > 0) {
                    accelerationQuery = bq;
                }
            }
        }
        if (accelerationQuery == null) {
            // Fallback - if there is no common prefix sequence then we look for the range of ngrams that appear at the start
            // of the string e.g. given 100 to 999 we would search for ngrams in the range
            //   TOKEN_START_OR_END_CHAR + "10" to
            //   TOKEN_START_OR_END_CHAR + "99"
            BytesRef lowerNgram = lower == null ? null : new BytesRef(firstNgramToken(
                addLineEndChars(toLowerCase(lower.utf8ToString()))));
            BytesRef upperNgram = upper == null ? null : new BytesRef(firstNgramToken(
                addLineEndChars(toLowerCase(upper.utf8ToString()))));
            accelerationQuery = new TermRangeQuery(name(), lowerNgram, upperNgram, true, true);
        }

        Supplier<Automaton> deferredAutomatonSupplier =
            () -> TermRangeQuery.toAutomaton(lower, upper, includeLower, includeUpper);
        AutomatonQueryOnBinaryDv slowQuery = new AutomatonQueryOnBinaryDv(name(), lower + "-" + upper, deferredAutomatonSupplier);

        BooleanQuery.Builder qBuilder = new BooleanQuery.Builder();
        qBuilder.add(accelerationQuery, Occur.MUST);
        qBuilder.add(slowQuery, Occur.MUST);
        return qBuilder.build();
    }
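[Editor's note] For illustration, a minimal standalone sketch (not part of the PR) of the two acceleration strategies described in the comments above: ngram TermQueries over a long common prefix of the bounds, and a fallback TermRangeQuery over the leading ngrams when the bounds share no prefix. NGRAM_SIZE, TOKEN_START, commonPrefix and firstNgram are simplified stand-ins for the mapper's internals, not its actual code:

// Hypothetical sketch; constants and helpers are stand-ins for the mapper's internals.
class RangeAccelerationSketch {
    static final int NGRAM_SIZE = 3;     // assumed to match the wildcard field's ngram size
    static final char TOKEN_START = '_'; // stand-in for TOKEN_START_OR_END_CHAR

    // Codepoint-aware common prefix of the two bounds, mirroring the loop in rangeQuery().
    static String commonPrefix(String lower, String upper) {
        StringBuilder prefix = new StringBuilder();
        for (int i = 0; i < Math.min(lower.length(), upper.length());) {
            int cL = lower.codePointAt(i);
            if (cL != upper.codePointAt(i)) {
                break;
            }
            prefix.appendCodePoint(cL);
            i += Character.charCount(cL);
        }
        return prefix.toString();
    }

    // Fallback endpoint: the leading ngram of a bound (a simplified firstNgramToken()).
    static String firstNgram(String s) {
        return s.substring(0, Math.min(NGRAM_SIZE, s.length()));
    }

    public static void main(String[] args) {
        // Long shared prefix: each ngram of "_c:/program files/" can back a MUST clause.
        System.out.println(commonPrefix(TOKEN_START + "c:/program files/a.txt",
                                        TOKEN_START + "c:/program files/z.txt"));
        // No shared prefix (e.g. 100 to 999): fall back to a range from "_10" to "_99".
        System.out.println(firstNgram(TOKEN_START + "100") + " .. " + firstNgram(TOKEN_START + "999"));
    }
}

The real implementation additionally lowercases the bounds and wraps them with marker characters via addLineEndChars, as the method above shows, and skips tokens shorter than two codepoints when building the accelerating BooleanQuery.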

    @Override
    public Query fuzzyQuery(
        Object value,

WildcardFieldMapperTests.java
@@ -30,6 +30,7 @@
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
@@ -214,7 +215,7 @@ public void testSearchResultsVersusKeywordField() throws IOException {
        Query wildcardFieldQuery = null;
        Query keywordFieldQuery = null;
        String pattern = null;
-       switch (randomInt(3)) {
+       switch (randomInt(4)) {
            case 0:
                pattern = getRandomWildcardPattern();
                wildcardFieldQuery = wildcardFieldType.fieldType().wildcardQuery(pattern, null, MOCK_QSC);
@@ -259,6 +260,14 @@
                keywordFieldQuery = keywordFieldType.fieldType().fuzzyQuery(pattern, fuzziness, prefixLength, 50,
                    transpositions, MOCK_QSC);
                break;
            case 4:
                TermRangeQuery trq = getRandomRange(values);
                wildcardFieldQuery = wildcardFieldType.fieldType().rangeQuery(trq.getLowerTerm(), trq.getUpperTerm(),
                    trq.includesLower(), trq.includesUpper(), null, null, null, MOCK_QSC);
                keywordFieldQuery = keywordFieldType.fieldType().rangeQuery(trq.getLowerTerm(), trq.getUpperTerm(),
                    trq.includesLower(), trq.includesUpper(), null, null, null, MOCK_QSC);
                break;
        }
        TopDocs kwTopDocs = searcher.search(keywordFieldQuery, values.size() + 1, Sort.RELEVANCE);
        TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, values.size() + 1, Sort.RELEVANCE);
@@ -294,6 +303,76 @@
        dir.close();
    }

    private void indexDoc(RandomIndexWriter iw, String value) throws IOException {
        Document doc = new Document();
        ParseContext.Document parseDoc = new ParseContext.Document();
        addFields(parseDoc, doc, value);
        indexDoc(parseDoc, doc, iw);
    }

    public void testRangeQueryVersusKeywordField() throws IOException {
        Directory dir = newDirectory();
        IndexWriterConfig iwc = newIndexWriterConfig(WildcardFieldMapper.WILDCARD_ANALYZER);
        iwc.setMergePolicy(newTieredMergePolicy(random()));
        RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);

        // Tests for acceleration strategy based on long common prefix
        indexDoc(iw, "C:\\Program Files\\a.txt");
        indexDoc(iw, "C:\\Program Files\\n.txt");
        indexDoc(iw, "C:\\Program Files\\z.txt");

        // Tests for acceleration strategy based on no common prefix
        indexDoc(iw, "a.txt");
        indexDoc(iw, "n.txt");
        indexDoc(iw, "z.txt");

        iw.forceMerge(1);
        DirectoryReader reader = iw.getReader();
        IndexSearcher searcher = newSearcher(reader);
        iw.close();

        String[][] rangeTests = {
            {"C:\\Program Files\\a", "C:\\Program Files\\z"},
            {"C:\\Program Files\\a", "C:\\Program Files\\n"},
            {null, "C:\\Program Files\\z"},
            {"C:\\Program Files\\a", null},

            {"a.txt", "z.txt"},
            {"a.txt", "n.txt"},
            {null, "z.txt"},
            {"a.txt", null}
        };

        for (String[] bounds : rangeTests) {
            BytesRef lower = bounds[0] == null ? null : new BytesRef(bounds[0]);
            BytesRef upper = bounds[1] == null ? null : new BytesRef(bounds[1]);
            TermRangeQuery trq = new TermRangeQuery(WILDCARD_FIELD_NAME, lower, upper, randomBoolean(), randomBoolean());
            Query wildcardFieldQuery = wildcardFieldType.fieldType().rangeQuery(trq.getLowerTerm(), trq.getUpperTerm(),
                trq.includesLower(), trq.includesUpper(), null, null, null, MOCK_QSC);
            Query keywordFieldQuery = keywordFieldType.fieldType().rangeQuery(trq.getLowerTerm(), trq.getUpperTerm(),
                trq.includesLower(), trq.includesUpper(), null, null, null, MOCK_QSC);

            TopDocs kwTopDocs = searcher.search(keywordFieldQuery, 10, Sort.RELEVANCE);
            TopDocs wildcardFieldTopDocs = searcher.search(wildcardFieldQuery, 10, Sort.RELEVANCE);
            assertThat(wildcardFieldTopDocs.totalHits.value, equalTo(kwTopDocs.totalHits.value));

            HashSet<Integer> expectedDocs = new HashSet<>();
            for (ScoreDoc topDoc : kwTopDocs.scoreDocs) {
                expectedDocs.add(topDoc.doc);
            }
            for (ScoreDoc wcTopDoc : wildcardFieldTopDocs.scoreDocs) {
                assertTrue(expectedDocs.remove(wcTopDoc.doc));
            }
            assertThat(expectedDocs.size(), equalTo(0));
        }
        reader.close();
        dir.close();
    }
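[Editor's note] For context, the user-facing effect this test exercises: with this PR, a plain range query can target a wildcard-mapped field. A hedged sketch using QueryBuilders from that era of Elasticsearch, where "file_path" is a hypothetical field name; note the mapper above rejects such queries when search.allow_expensive_queries is false:

import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.RangeQueryBuilder;

class WildcardRangeQueryExample {
    // Hypothetical usage; "file_path" is an assumed wildcard-mapped field name.
    static RangeQueryBuilder buildQuery() {
        return QueryBuilders.rangeQuery("file_path")
            .gte("C:\\Program Files\\a.txt")
            .lte("C:\\Program Files\\z.txt");
    }
}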


    public void testRegexAcceleration() throws IOException, ParseException {
        // All these expressions should rewrite to a match all with no verification step required at all
        String[] superfastRegexes = { ".*", "...*..", "(foo|bar|.*)", "@" };
@@ -485,6 +564,54 @@ public void testFuzzyAcceleration() throws IOException, ParseException {
        }
    }


    static class RangeTest {
        String lower;
        String upper;
        String ngrams;

        RangeTest(String lower, String upper, String ngrams) {
            this.lower = lower;
            this.upper = upper;
            this.ngrams = ngrams;
        }

        Query getRangeQuery() {
            return wildcardFieldType.fieldType().rangeQuery(lower, upper, true, true, null, null, null, MOCK_QSC);
        }

        Query getExpectedApproxQuery() throws ParseException {
            BooleanQuery.Builder bq = new BooleanQuery.Builder();
            if (ngrams != null) {
                String[] tokens = ngrams.split(" ");
                for (String token : tokens) {
                    Query ngramQuery = new TermQuery(
                        new Term(WILDCARD_FIELD_NAME, token.replaceAll("_", WildcardFieldMapper.TOKEN_START_STRING))
                    );
                    bq.add(ngramQuery, Occur.MUST);
                }
            }
            return bq.build();
        }
    }

    public void testRangeAcceleration() throws IOException, ParseException {
        RangeTest[] tests = {
            new RangeTest("c:/a.txt", "c:/z.txt", "_c: c:/"),
            new RangeTest("C:/ProgramFiles/a.txt", "C:/ProgramFiles/z.txt", "_c: :/p pro ogr ram mfi ile es/"),
        };
        for (RangeTest test : tests) {
            Query wildcardFieldQuery = test.getRangeQuery();
            testExpectedAccelerationQuery(test.lower + "-" + test.upper, wildcardFieldQuery, test.getExpectedApproxQuery());
        }
    }
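[Editor's note] A quick cross-check of the expected ngram strings above ("_" stands for the start marker, per getExpectedApproxQuery): the lowercased, marker-prefixed common prefixes of the two test cases are "_c:/" and "_c:/programfiles/", and the expected tokens are every other 3-gram plus the trailing full 3-gram. A throwaway sketch reproducing them; the alternating-selection rule is inferred from these expected values, not from the mapper's source:

    // Throwaway sketch; the every-other-3-gram rule is an inference from this
    // test's expected values, not a documented contract of the mapper.
    static java.util.List<String> expectedNgrams(String markedPrefix) {
        java.util.List<String> out = new java.util.ArrayList<>();
        int last = markedPrefix.length() - 3;      // start offset of the final 3-gram
        for (int i = 0; i <= last; i += 2) {
            out.add(markedPrefix.substring(i, i + 3));
        }
        if (last > 0 && last % 2 != 0) {
            out.add(markedPrefix.substring(last)); // always cover the tail
        }
        return out;
    }
    // expectedNgrams("_c:/")              -> [_c:, c:/]
    // expectedNgrams("_c:/programfiles/") -> [_c:, :/p, pro, ogr, ram, mfi, ile, es/]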

    void testExpectedAccelerationQuery(String regex, Query combinedQuery, String expectedAccelerationQueryString) throws ParseException {
        QueryParser qsp = new QueryParser(WILDCARD_FIELD_NAME, new KeywordAnalyzer());
@@ -530,6 +657,33 @@ private String getRandomFuzzyPattern(HashSet<String> values, int edits, int pref
        }
        return randomValue;
    }

    private TermRangeQuery getRandomRange(HashSet<String> values) {
        // Pick one of the indexed document values to focus our queries on.
        String randomValue = values.toArray(new String[0])[randomIntBetween(0, values.size() - 1)];
        StringBuilder upper = new StringBuilder();
        // Pick a part of the string to change
        int substitutionPoint = randomIntBetween(0, randomValue.length() - 1);
        int substitutionLength = randomIntBetween(1, Math.min(10, randomValue.length() - substitutionPoint));

        // Add any head to the result, unchanged
        if (substitutionPoint > 0) {
            upper.append(randomValue.substring(0, substitutionPoint));
        }

        // Modify the middle: replace all 'a' chars with 'z' (this never decreases a
        // character, so the result sorts at or above the original value)
        String replacementPart = randomValue.substring(substitutionPoint, substitutionPoint + substitutionLength);
        upper.append(replacementPart.replaceAll("a", "z"));

        // Add any remaining tail, unchanged
        if (substitutionPoint + substitutionLength <= randomValue.length() - 1) {
            upper.append(randomValue.substring(substitutionPoint + substitutionLength));
        }
        return new TermRangeQuery(WILDCARD_FIELD_NAME, new BytesRef(randomValue), new BytesRef(upper.toString()),
            randomBoolean(), randomBoolean());
    }


    private String getRandomRegexPattern(HashSet<String> values) {
        // Pick one of the indexed document values to focus our queries on.