Wildcard field - add normalizer support (#53851) (#54109)

Backport support for normalisation to the wildcard field. Closes #53603
elastic · Mar 24, 2020 · 6a60f85 · 6a60f85
1 parent c141c1d
commit 6a60f85
Show file tree

Hide file tree

Showing 8 changed files with 204 additions and 43 deletions.
diff --git a/docs/reference/mapping/types/wildcard.asciidoc b/docs/reference/mapping/types/wildcard.asciidoc
@@ -50,6 +50,23 @@ POST my_index/_doc/_search
 --------------------------------------------------
 
 
+[[wildcard-params]]
+==== Parameters for wildcard fields
+
+The following parameters are accepted by `wildcard` fields:
+
+[horizontal]
+
+<<ignore-above,`ignore_above`>>::
+
+    Do not index any string longer than this value. Defaults to `2147483647`
+    so that all values are accepted.
+
+<<normalizer,`normalizer`>>::
+
+    How to pre-process the value prior to indexing. Defaults to `null`,
+    meaning the value is kept as-is.
+
 ==== Limitations
 
 * `wildcard` fields are untokenized like keyword fields, so do not support queries that rely on word positions such as phrase queries.

diff --git a/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java b/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java
@@ -358,14 +358,14 @@ public Query fuzzyQuery(Object value, Fuzziness fuzziness, int prefixLength, int
     }
 
     public Query prefixQuery(String value, @Nullable MultiTermQuery.RewriteMethod method, QueryShardContext context) {
-        throw new QueryShardException(context, "Can only use prefix queries on keyword and text fields - not on [" + name
+        throw new QueryShardException(context, "Can only use prefix queries on keyword, text and wildcard fields - not on [" + name
             + "] which is of type [" + typeName() + "]");
     }
 
     public Query wildcardQuery(String value,
                                @Nullable MultiTermQuery.RewriteMethod method,
                                QueryShardContext context) {
-        throw new QueryShardException(context, "Can only use wildcard queries on keyword and text fields - not on [" + name
+        throw new QueryShardException(context, "Can only use wildcard queries on keyword, text and wildcard fields - not on [" + name
             + "] which is of type [" + typeName() + "]");
     }
 

diff --git a/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java b/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java
@@ -19,6 +19,7 @@
 
 package org.elasticsearch.index.mapper;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.FuzzyQuery;
 import org.apache.lucene.search.MultiTermQuery;
@@ -93,6 +94,36 @@ public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, Quer
         return query;
     }
 
+    public static final String normalizeWildcardPattern(String fieldname, String value, Analyzer normalizer)  {
+        if (normalizer == null) {
+            return value; // no normalizer supplied: the pattern is returned unchanged
+        }
+        // Normalize only the literal text between matches of WILDCARD_PATTERN, e.g. F?o Ba* to f?o ba*. The matched
+        // wildcard text is copied verbatim, so that e.g. a char_filter in the normalizer cannot remove it.
+        Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(value);
+        BytesRefBuilder sb = new BytesRefBuilder();
+        int last = 0; // end offset of the previous match, i.e. where the next literal chunk begins
+
+        while (wildcardMatcher.find()) {
+            if (wildcardMatcher.start() > 0) { // skipped only when the match sits at the very start of the pattern
+                String chunk = value.substring(last, wildcardMatcher.start());
+
+                BytesRef normalized = normalizer.normalize(fieldname, chunk);
+                sb.append(normalized);
+            }
+            // append the matched text verbatim, without normalizing it
+            sb.append(new BytesRef(wildcardMatcher.group()));
+
+            last = wildcardMatcher.end();
+        }
+        if (last < value.length()) { // literal text remains after the final match (or there was no match at all)
+            String chunk = value.substring(last);
+            BytesRef normalized = normalizer.normalize(fieldname, chunk);
+            sb.append(normalized);
+        }
+        return sb.toBytesRef().utf8ToString(); // hand the assembled bytes back to the caller as a String
+    }    
+
     @Override
     public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
         failIfNotIndexed();
@@ -103,30 +134,8 @@ public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, Qu
 
         Term term;
         if (searchAnalyzer() != null) {
-            // we want to normalize everything except wildcard characters, e.g. F?o Ba* to f?o ba*, even if e.g there
-            // is a char_filter that would otherwise remove them
-            Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(value);
-            BytesRefBuilder sb = new BytesRefBuilder();
-            int last = 0;
-
-            while (wildcardMatcher.find()) {
-                if (wildcardMatcher.start() > 0) {
-                    String chunk = value.substring(last, wildcardMatcher.start());
-
-                    BytesRef normalized = searchAnalyzer().normalize(name(), chunk);
-                    sb.append(normalized);
-                }
-                // append the matched group - without normalizing
-                sb.append(new BytesRef(wildcardMatcher.group()));
-
-                last = wildcardMatcher.end();
-            }
-            if (last < value.length()) {
-                String chunk = value.substring(last);
-                BytesRef normalized = searchAnalyzer().normalize(name(), chunk);
-                sb.append(normalized);
-            }
-            term = new Term(name(), sb.toBytesRef());
+            value = normalizeWildcardPattern(name(), value, searchAnalyzer());
+            term = new Term(name(), value);
         } else {
             term = new Term(name(), indexedValueForSearch(value));
         }

diff --git a/server/src/main/java/org/elasticsearch/index/query/QueryBuilders.java b/server/src/main/java/org/elasticsearch/index/query/QueryBuilders.java
@@ -264,7 +264,7 @@ public static RangeQueryBuilder rangeQuery(String name) {
      * which matches any single character. Note this query can be slow, as it
      * needs to iterate over many terms. In order to prevent extremely slow WildcardQueries,
      * a Wildcard term should not start with one of the wildcards {@code *} or
-     * {@code ?}.
+     * {@code ?}. (The wildcard field type, however, is optimised for leading wildcards.)
      *
      * @param name  The field name
      * @param query The wildcard query string

diff --git a/server/src/test/java/org/elasticsearch/index/query/PrefixQueryBuilderTests.java b/server/src/test/java/org/elasticsearch/index/query/PrefixQueryBuilderTests.java
@@ -116,7 +116,7 @@ public void testNumeric() throws Exception {
         QueryShardContext context = createShardContext();
         QueryShardException e = expectThrows(QueryShardException.class,
                 () -> query.toQuery(context));
-        assertEquals("Can only use prefix queries on keyword and text fields - not on [mapped_int] which is of type [integer]",
+        assertEquals("Can only use prefix queries on keyword, text and wildcard fields - not on [mapped_int] which is of type [integer]",
                 e.getMessage());
     }
 

diff --git a/server/src/test/java/org/elasticsearch/index/query/QueryStringQueryBuilderTests.java b/server/src/test/java/org/elasticsearch/index/query/QueryStringQueryBuilderTests.java
@@ -816,7 +816,7 @@ public void testPrefixNumeric() throws Exception {
         QueryShardContext context = createShardContext();
         QueryShardException e = expectThrows(QueryShardException.class,
                 () -> query.toQuery(context));
-        assertEquals("Can only use prefix queries on keyword and text fields - not on [mapped_int] which is of type [integer]",
+        assertEquals("Can only use prefix queries on keyword, text and wildcard fields - not on [mapped_int] which is of type [integer]",
                 e.getMessage());
         query.lenient(true);
         query.toQuery(context); // no exception

diff --git a/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml
@@ -10,10 +10,20 @@ setup:
         body:
           settings:
             number_of_replicas: 0
+            analysis:
+              normalizer:
+                lowercase:
+                  type: custom
+                  char_filter: []
+                  filter: ["lowercase"]
           mappings:
             properties:
               my_wildcard:
                  type: wildcard
+                 normalizer: lowercase
+                 fields:
+                   case_sensitive:
+                     type: wildcard
   - do:
       index:
         index: test-index
@@ -26,6 +36,12 @@ setup:
         id: 2
         body:
           my_wildcard: goodbye world
+  - do:
+      index:
+        index: test-index
+        id: 3
+        body:
+          my_wildcard: cAsE iNsEnSiTiVe World
 
   - do:
       indices.refresh: {}
@@ -80,6 +96,31 @@ setup:
               my_wildcard: {value: "*ello worl*" }
 
 
+  - match: {hits.total.value: 1}
+---
+"Case insensitive query":
+  - do:
+      search:
+        body:
+          track_total_hits: true
+          query:
+            wildcard:
+              my_wildcard: {value: "*Worl*" }
+
+
+  - match: {hits.total.value: 3}
+
+---
+"Case sensitive query":
+  - do:
+      search:
+        body:
+          track_total_hits: true
+          query:
+            wildcard:
+              my_wildcard.case_sensitive: {value: "*Worl*" }
+
+
   - match: {hits.total.value: 1}
 
 ---
@@ -93,7 +134,7 @@ setup:
               my_wildcard: {value: "*ld" }
 
 
-  - match: {hits.total.value: 2}
+  - match: {hits.total.value: 3}
 
 ---
 "Long suffix query":
@@ -188,8 +229,8 @@ setup:
               terms: {field: "my_wildcard" }
 
 
-  - match: {hits.total.value: 2}
-  - length: { aggregations.top_vals.buckets: 2 }
+  - match: {hits.total.value: 3}
+  - length: { aggregations.top_vals.buckets: 3 }
 
 ---
 "Sort works":
@@ -199,20 +240,21 @@ setup:
           track_total_hits: true
           sort: [ { "my_wildcard": "desc" } ]
 
-  - match: { hits.total.value: 2 }
-  - length: { hits.hits: 2 }
+  - match: { hits.total.value: 3 }
+  - length: { hits.hits: 3 }
   - match: { hits.hits.0._id: "1" }
   - match: { hits.hits.1._id: "2" }
+  - match: { hits.hits.2._id: "3" }
 
   - do:
       search:
         body:
           track_total_hits: true
           sort: [ { "my_wildcard": "asc" } ]
 
-  - match: { hits.total.value: 2 }
-  - length: { hits.hits: 2 }
-  - match: { hits.hits.0._id: "2" }
-  - match: { hits.hits.1._id: "1" }
-
+  - match: { hits.total.value: 3 }
+  - length: { hits.hits: 3 }
+  - match: { hits.hits.0._id: "3" }
+  - match: { hits.hits.1._id: "2" }
+  - match: { hits.hits.2._id: "1" }