Add conditional token filter to elasticsearch (#31958)
This allows token filters to be applied selectively, depending on the status of
the current token in the token stream.  The filter takes a scripted predicate,
and only applies its subfilters when the predicate returns true.
romseygeek committed Sep 5, 2018
1 parent aff5658 commit 81014e0
Showing 13 changed files with 578 additions and 1 deletion.
2 changes: 2 additions & 0 deletions docs/painless/painless-contexts/index.asciidoc
Original file line number Diff line number Diff line change
@@ -30,6 +30,8 @@ include::painless-metric-agg-reduce-context.asciidoc[]

include::painless-bucket-agg-context.asciidoc[]

include::painless-analysis-predicate-context.asciidoc[]

include::painless-watcher-condition-context.asciidoc[]

include::painless-watcher-transform-context.asciidoc[]
43 changes: 43 additions & 0 deletions docs/painless/painless-contexts/painless-analysis-predicate-context.asciidoc
@@ -0,0 +1,43 @@
[[painless-analysis-predicate-context]]
=== Analysis Predicate Context

Use a Painless script to determine whether or not the current token in an
analysis chain matches a predicate.

*Variables*

`params` (`Map`, read-only)::
User-defined parameters passed in as part of the query.

`token.term` (`CharSequence`, read-only)::
The characters of the current token.

`token.position` (`int`, read-only)::
The position of the current token.

`token.positionIncrement` (`int`, read-only)::
The position increment of the current token.

`token.positionLength` (`int`, read-only)::
The position length of the current token.

`token.startOffset` (`int`, read-only)::
The start offset of the current token.

`token.endOffset` (`int`, read-only)::
The end offset of the current token.

`token.type` (`String`, read-only)::
The type of the current token.

`token.keyword` (`boolean`, read-only)::
Whether or not the current token is marked as a keyword.

*Return*

`boolean`::
Whether or not the current token matches the predicate.

*API*

The standard <<painless-api-reference, Painless API>> is available.
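*Example*

For example, a predicate that matches only alphanumeric tokens shorter than a
user-supplied length might look like the following sketch (the `min_length`
parameter is hypothetical and would be supplied via `params` in the filter's
script definition):

[source,painless]
--------------------------------------------------
token.type == "<ALPHANUM>" && token.term.length() < params.min_length
--------------------------------------------------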
2 changes: 2 additions & 0 deletions docs/reference/analysis/tokenfilters.asciidoc
Expand Up @@ -37,6 +37,8 @@ include::tokenfilters/word-delimiter-graph-tokenfilter.asciidoc[]

include::tokenfilters/multiplexer-tokenfilter.asciidoc[]

include::tokenfilters/condition-tokenfilter.asciidoc[]

include::tokenfilters/stemmer-tokenfilter.asciidoc[]

include::tokenfilters/stemmer-override-tokenfilter.asciidoc[]
90 changes: 90 additions & 0 deletions docs/reference/analysis/tokenfilters/condition-tokenfilter.asciidoc
@@ -0,0 +1,90 @@
[[analysis-condition-tokenfilter]]
=== Conditional Token Filter

The conditional token filter takes a predicate script and a list of subfilters, and
applies the subfilters to the current token only if the token matches the predicate.

[float]
=== Options
[horizontal]
filter:: a chain of token filters to apply to the current token if the predicate
matches. These can be any token filters defined elsewhere in the index mappings.

script:: a predicate script that determines whether or not the filters will be applied
to the current token. Note that only inline scripts are supported.

[float]
=== Settings example

You can set it up like this:

[source,js]
--------------------------------------------------
PUT /condition_example
{
  "settings" : {
    "analysis" : {
      "analyzer" : {
        "my_analyzer" : {
          "tokenizer" : "standard",
          "filter" : [ "my_condition" ]
        }
      },
      "filter" : {
        "my_condition" : {
          "type" : "condition",
          "filter" : [ "lowercase" ],
          "script" : {
            "source" : "token.getTerm().length() < 5" <1>
          }
        }
      }
    }
  }
}
--------------------------------------------------
// CONSOLE

<1> This will only apply the `lowercase` filter to terms that are less than 5
characters in length.

And test it like this:

[source,js]
--------------------------------------------------
POST /condition_example/_analyze
{
"analyzer" : "my_analyzer",
"text" : "What Flapdoodle"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]

And it will respond:

[source,js]
--------------------------------------------------
{
"tokens": [
{
"token": "what", <1>
"start_offset": 0,
"end_offset": 4,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "Flapdoodle", <2>
"start_offset": 5,
"end_offset": 15,
"type": "<ALPHANUM>",
"position": 1
}
]
}
--------------------------------------------------
// TESTRESPONSE
<1> The term `What` has been lowercased, because it is only 4 characters long.
<2> The term `Flapdoodle` has been left in its original case, because it doesn't match
the predicate.
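
Because `filter` accepts a chain, several filters can be applied conditionally at
once. A sketch of a variant of the example above (the index name
`condition_chain_example` is hypothetical) that both lowercases and stems short
terms using the built-in `porter_stem` filter:

[source,js]
--------------------------------------------------
PUT /condition_chain_example
{
  "settings" : {
    "analysis" : {
      "analyzer" : {
        "my_analyzer" : {
          "tokenizer" : "standard",
          "filter" : [ "my_condition" ]
        }
      },
      "filter" : {
        "my_condition" : {
          "type" : "condition",
          "filter" : [ "lowercase", "porter_stem" ],
          "script" : {
            "source" : "token.getTerm().length() < 5"
          }
        }
      }
    }
  }
}
--------------------------------------------------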
9 changes: 9 additions & 0 deletions modules/analysis-common/build.gradle
@@ -20,4 +20,13 @@
esplugin {
  description 'Adds "built in" analyzers to Elasticsearch.'
  classname 'org.elasticsearch.analysis.common.CommonAnalysisPlugin'
  extendedPlugins = ['lang-painless']
}

dependencies {
  compileOnly project(':modules:lang-painless')
}

integTestCluster {
  module project(':modules:lang-painless')
}
40 changes: 40 additions & 0 deletions modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AnalysisPainlessExtension.java
@@ -0,0 +1,40 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.analysis.common;

import org.elasticsearch.painless.spi.PainlessExtension;
import org.elasticsearch.painless.spi.Whitelist;
import org.elasticsearch.painless.spi.WhitelistLoader;
import org.elasticsearch.script.ScriptContext;

import java.util.Collections;
import java.util.List;
import java.util.Map;

public class AnalysisPainlessExtension implements PainlessExtension {

    private static final Whitelist WHITELIST =
        WhitelistLoader.loadFromResourceFiles(AnalysisPainlessExtension.class, "painless_whitelist.txt");

    @Override
    public Map<ScriptContext<?>, List<Whitelist>> getContextWhitelists() {
        return Collections.singletonMap(AnalysisPredicateScript.CONTEXT, Collections.singletonList(WHITELIST));
    }
}
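
The referenced `painless_whitelist.txt` resource is not among the files shown
here. A plausible sketch, using the standard Painless whitelist file syntax and
assuming it exposes the `Token` getters defined in `AnalysisPredicateScript`
(next file):

--------------------------------------------------
class org.elasticsearch.analysis.common.AnalysisPredicateScript$Token {
  CharSequence getTerm()
  int getPosition()
  int getPositionIncrement()
  int getPositionLength()
  int getStartOffset()
  int getEndOffset()
  String getType()
  boolean isKeyword()
}
--------------------------------------------------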
87 changes: 87 additions & 0 deletions modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AnalysisPredicateScript.java
@@ -0,0 +1,87 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.analysis.common;

import org.elasticsearch.script.ScriptContext;

/**
* A predicate based on the current token in a TokenStream
*/
public abstract class AnalysisPredicateScript {

    /**
     * Encapsulation of the state of the current token
     */
    public static class Token {
        public CharSequence term;
        public int pos;
        public int posInc;
        public int posLen;
        public int startOffset;
        public int endOffset;
        public String type;
        public boolean isKeyword;

        public CharSequence getTerm() {
            return term;
        }

        public int getPositionIncrement() {
            return posInc;
        }

        public int getPosition() {
            return pos;
        }

        public int getPositionLength() {
            return posLen;
        }

        public int getStartOffset() {
            return startOffset;
        }

        public int getEndOffset() {
            return endOffset;
        }

        public String getType() {
            return type;
        }

        public boolean isKeyword() {
            return isKeyword;
        }
    }

    /**
     * Returns {@code true} if the current term matches the predicate
     */
    public abstract boolean execute(Token token);

    public interface Factory {
        AnalysisPredicateScript newInstance();
    }

    public static final String[] PARAMETERS = new String[]{ "token" };
    public static final ScriptContext<Factory> CONTEXT = new ScriptContext<>("analysis", Factory.class);

}
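
The `ScriptedConditionTokenFilterFactory` registered below is not included in
this excerpt. A minimal sketch of how such a filter might bridge this script to
Lucene's `ConditionalTokenFilter` (the class name and wiring here are
assumptions, not the actual implementation):

--------------------------------------------------
package org.elasticsearch.analysis.common;

import java.io.IOException;
import java.util.function.Function;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

/**
 * Hypothetical bridge between AnalysisPredicateScript and Lucene's
 * ConditionalTokenFilter; the real factory lives in the files not shown here.
 */
class ScriptedConditionTokenFilter extends ConditionalTokenFilter {

    private final AnalysisPredicateScript script;
    private final AnalysisPredicateScript.Token token = new AnalysisPredicateScript.Token();

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
    private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);

    private int pos = -1;

    ScriptedConditionTokenFilter(TokenStream input, Function<TokenStream, TokenStream> inner,
                                 AnalysisPredicateScript script) {
        super(input, inner);
        this.script = script;
    }

    @Override
    protected boolean shouldFilter() throws IOException {
        // Snapshot the current attribute state into the script's Token view,
        // then ask the predicate whether the wrapped filters should run.
        pos += posIncAtt.getPositionIncrement();
        token.term = termAtt;
        token.pos = pos;
        token.posInc = posIncAtt.getPositionIncrement();
        token.posLen = posLenAtt.getPositionLength();
        token.startOffset = offsetAtt.startOffset();
        token.endOffset = offsetAtt.endOffset();
        token.type = typeAtt.type();
        token.isKeyword = keywordAtt.isKeyword();
        return script.execute(token);
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        pos = -1;
    }
}
--------------------------------------------------

A factory for such a filter would presumably obtain the script by compiling
against the context, along the lines of
`scriptService.compile(script, AnalysisPredicateScript.CONTEXT).newInstance()`.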
modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
@@ -111,9 +111,16 @@
import org.apache.lucene.analysis.tr.ApostropheFilter;
import org.apache.lucene.analysis.tr.TurkishAnalyzer;
import org.apache.lucene.analysis.util.ElisionFilter;
import org.apache.lucene.util.SetOnce;
import org.elasticsearch.client.Client;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.NodeEnvironment;
import org.elasticsearch.index.analysis.AnalyzerProvider;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.PreBuiltAnalyzerProviderFactory;
@@ -127,20 +134,44 @@
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.plugins.ScriptPlugin;
import org.elasticsearch.script.ScriptContext;
import org.elasticsearch.script.ScriptService;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.watcher.ResourceWatcherService;
import org.tartarus.snowball.ext.DutchStemmer;
import org.tartarus.snowball.ext.FrenchStemmer;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import static org.elasticsearch.plugins.AnalysisPlugin.requiresAnalysisSettings;

public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, ScriptPlugin {

    private static final DeprecationLogger DEPRECATION_LOGGER = new DeprecationLogger(Loggers.getLogger(CommonAnalysisPlugin.class));

    private final SetOnce<ScriptService> scriptService = new SetOnce<>();

    @Override
    public Collection<Object> createComponents(Client client, ClusterService clusterService, ThreadPool threadPool,
                                               ResourceWatcherService resourceWatcherService, ScriptService scriptService,
                                               NamedXContentRegistry xContentRegistry, Environment environment,
                                               NodeEnvironment nodeEnvironment, NamedWriteableRegistry namedWriteableRegistry) {
        this.scriptService.set(scriptService);
        return Collections.emptyList();
    }

    @Override
    @SuppressWarnings("rawtypes") // TODO ScriptPlugin needs to change this to pass precommit?
    public List<ScriptContext> getContexts() {
        return Collections.singletonList(AnalysisPredicateScript.CONTEXT);
    }

    @Override
    public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
        Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> analyzers = new TreeMap<>();
@@ -202,6 +233,8 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
filters.put("classic", ClassicFilterFactory::new);
filters.put("czech_stem", CzechStemTokenFilterFactory::new);
filters.put("common_grams", requiresAnalysisSettings(CommonGramsTokenFilterFactory::new));
filters.put("condition",
requiresAnalysisSettings((i, e, n, s) -> new ScriptedConditionTokenFilterFactory(i, n, s, scriptService.get())));
filters.put("decimal_digit", DecimalDigitFilterFactory::new);
filters.put("delimited_payload_filter", LegacyDelimitedPayloadTokenFilterFactory::new);
filters.put("delimited_payload", DelimitedPayloadTokenFilterFactory::new);
(Remaining changed files not shown.)
