Skip to content

Commit

Permalink
Wildcard field - add normalizer support (#53851) (#54109)
Browse files Browse the repository at this point in the history
Backport support for normalisation to wildcard field

Closes #53603
  • Loading branch information
markharwood authored Mar 24, 2020
1 parent c141c1d commit 6a60f85
Show file tree
Hide file tree
Showing 8 changed files with 204 additions and 43 deletions.
17 changes: 17 additions & 0 deletions docs/reference/mapping/types/wildcard.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,23 @@ POST my_index/_doc/_search
--------------------------------------------------


[[wildcard-params]]
==== Parameters for wildcard fields

The following parameters are accepted by `wildcard` fields:

[horizontal]

<<ignore-above,`ignore_above`>>::

Do not index any string longer than this value. Defaults to `2147483647`
so that all values are accepted.

<<normalizer,`normalizer`>>::

How to pre-process the value prior to indexing. Defaults to `null`,
meaning the value is kept as-is.

==== Limitations

* `wildcard` fields are untokenized like keyword fields, so do not support queries that rely on word positions such as phrase queries.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -358,14 +358,14 @@ public Query fuzzyQuery(Object value, Fuzziness fuzziness, int prefixLength, int
}

public Query prefixQuery(String value, @Nullable MultiTermQuery.RewriteMethod method, QueryShardContext context) {
throw new QueryShardException(context, "Can only use prefix queries on keyword and text fields - not on [" + name
throw new QueryShardException(context, "Can only use prefix queries on keyword, text and wildcard fields - not on [" + name
+ "] which is of type [" + typeName() + "]");
}

public Query wildcardQuery(String value,
@Nullable MultiTermQuery.RewriteMethod method,
QueryShardContext context) {
throw new QueryShardException(context, "Can only use wildcard queries on keyword and text fields - not on [" + name
throw new QueryShardException(context, "Can only use wildcard queries on keyword, text and wildcard fields - not on [" + name
+ "] which is of type [" + typeName() + "]");
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

package org.elasticsearch.index.mapper;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MultiTermQuery;
Expand Down Expand Up @@ -93,6 +94,36 @@ public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, Quer
return query;
}

/**
 * Normalizes the literal parts of a wildcard pattern while copying every span matched by
 * {@code WILDCARD_PATTERN} through unchanged.
 *
 * @param fieldname  field name handed to the normalizer for each literal chunk
 * @param value      the wildcard pattern to normalize
 * @param normalizer analyzer used to normalize literal chunks; when {@code null} the pattern is
 *                   returned untouched
 * @return the pattern with its literal chunks normalized, decoded from UTF-8
 */
public static final String normalizeWildcardPattern(String fieldname, String value, Analyzer normalizer) {
    if (normalizer == null) {
        return value;
    }
    // we want to normalize everything except wildcard characters, e.g. F?o Ba* to f?o ba*, even if e.g. there
    // is a char_filter that would otherwise remove them
    Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(value);
    BytesRefBuilder sb = new BytesRefBuilder();
    int last = 0;

    while (wildcardMatcher.find()) {
        // Only normalize when there is literal text between the previous match and this one;
        // comparing against `last` (rather than 0) avoids normalizing an empty chunk when two
        // matches are directly adjacent.
        if (wildcardMatcher.start() > last) {
            String chunk = value.substring(last, wildcardMatcher.start());

            BytesRef normalized = normalizer.normalize(fieldname, chunk);
            sb.append(normalized);
        }
        // append the matched group - without normalizing
        sb.append(new BytesRef(wildcardMatcher.group()));

        last = wildcardMatcher.end();
    }
    // normalize whatever literal text follows the final match (or the whole value if nothing matched)
    if (last < value.length()) {
        String chunk = value.substring(last);
        BytesRef normalized = normalizer.normalize(fieldname, chunk);
        sb.append(normalized);
    }
    return sb.toBytesRef().utf8ToString();
}

@Override
public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
failIfNotIndexed();
Expand All @@ -103,30 +134,8 @@ public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, Qu

Term term;
if (searchAnalyzer() != null) {
// we want to normalize everything except wildcard characters, e.g. F?o Ba* to f?o ba*, even if e.g there
// is a char_filter that would otherwise remove them
Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(value);
BytesRefBuilder sb = new BytesRefBuilder();
int last = 0;

while (wildcardMatcher.find()) {
if (wildcardMatcher.start() > 0) {
String chunk = value.substring(last, wildcardMatcher.start());

BytesRef normalized = searchAnalyzer().normalize(name(), chunk);
sb.append(normalized);
}
// append the matched group - without normalizing
sb.append(new BytesRef(wildcardMatcher.group()));

last = wildcardMatcher.end();
}
if (last < value.length()) {
String chunk = value.substring(last);
BytesRef normalized = searchAnalyzer().normalize(name(), chunk);
sb.append(normalized);
}
term = new Term(name(), sb.toBytesRef());
value = normalizeWildcardPattern(name(), value, searchAnalyzer());
term = new Term(name(), value);
} else {
term = new Term(name(), indexedValueForSearch(value));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ public static RangeQueryBuilder rangeQuery(String name) {
* which matches any single character. Note this query can be slow, as it
* needs to iterate over many terms. In order to prevent extremely slow WildcardQueries,
* a Wildcard term should not start with one of the wildcards {@code *} or
* {@code ?}.
* {@code ?}. (The wildcard field type, however, is optimised for leading wildcards.)
*
* @param name The field name
* @param query The wildcard query string
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ public void testNumeric() throws Exception {
QueryShardContext context = createShardContext();
QueryShardException e = expectThrows(QueryShardException.class,
() -> query.toQuery(context));
assertEquals("Can only use prefix queries on keyword and text fields - not on [mapped_int] which is of type [integer]",
assertEquals("Can only use prefix queries on keyword, text and wildcard fields - not on [mapped_int] which is of type [integer]",
e.getMessage());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -816,7 +816,7 @@ public void testPrefixNumeric() throws Exception {
QueryShardContext context = createShardContext();
QueryShardException e = expectThrows(QueryShardException.class,
() -> query.toQuery(context));
assertEquals("Can only use prefix queries on keyword and text fields - not on [mapped_int] which is of type [integer]",
assertEquals("Can only use prefix queries on keyword, text and wildcard fields - not on [mapped_int] which is of type [integer]",
e.getMessage());
query.lenient(true);
query.toQuery(context); // no exception
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,20 @@ setup:
body:
settings:
number_of_replicas: 0
analysis:
normalizer:
lowercase:
type: custom
char_filter: []
filter: ["lowercase"]
mappings:
properties:
my_wildcard:
type: wildcard
normalizer: lowercase
fields:
case_sensitive:
type: wildcard
- do:
index:
index: test-index
Expand All @@ -26,6 +36,12 @@ setup:
id: 2
body:
my_wildcard: goodbye world
- do:
index:
index: test-index
id: 3
body:
my_wildcard: cAsE iNsEnSiTiVe World

- do:
indices.refresh: {}
Expand Down Expand Up @@ -80,6 +96,31 @@ setup:
my_wildcard: {value: "*ello worl*" }


- match: {hits.total.value: 1}
---
"Case insensitive query":
- do:
search:
body:
track_total_hits: true
query:
wildcard:
my_wildcard: {value: "*Worl*" }


- match: {hits.total.value: 3}

---
"Case sensitive query":
- do:
search:
body:
track_total_hits: true
query:
wildcard:
my_wildcard.case_sensitive: {value: "*Worl*" }


- match: {hits.total.value: 1}

---
Expand All @@ -93,7 +134,7 @@ setup:
my_wildcard: {value: "*ld" }


- match: {hits.total.value: 2}
- match: {hits.total.value: 3}

---
"Long suffix query":
Expand Down Expand Up @@ -188,8 +229,8 @@ setup:
terms: {field: "my_wildcard" }


- match: {hits.total.value: 2}
- length: { aggregations.top_vals.buckets: 2 }
- match: {hits.total.value: 3}
- length: { aggregations.top_vals.buckets: 3 }

---
"Sort works":
Expand All @@ -199,20 +240,21 @@ setup:
track_total_hits: true
sort: [ { "my_wildcard": "desc" } ]

- match: { hits.total.value: 2 }
- length: { hits.hits: 2 }
- match: { hits.total.value: 3 }
- length: { hits.hits: 3 }
- match: { hits.hits.0._id: "1" }
- match: { hits.hits.1._id: "2" }
- match: { hits.hits.2._id: "3" }

- do:
search:
body:
track_total_hits: true
sort: [ { "my_wildcard": "asc" } ]

- match: { hits.total.value: 2 }
- length: { hits.hits: 2 }
- match: { hits.hits.0._id: "2" }
- match: { hits.hits.1._id: "1" }

- match: { hits.total.value: 3 }
- length: { hits.hits: 3 }
- match: { hits.hits.0._id: "3" }
- match: { hits.hits.1._id: "2" }
- match: { hits.hits.2._id: "1" }

Loading

0 comments on commit 6a60f85

Please sign in to comment.