From 636442700c02bdf929cad6dd47f18a84635309c4 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Wed, 5 Sep 2018 14:52:43 +0100 Subject: [PATCH] Add conditional token filter to elasticsearch (#31958) This allows tokenfilters to be applied selectively, depending on the status of the current token in the tokenstream. The filter takes a scripted predicate, and only applies its subfilter when the predicate returns true. --- .../painless/painless-contexts/index.asciidoc | 2 + ...inless-analysis-predicate-context.asciidoc | 43 +++++++ docs/reference/analysis/tokenfilters.asciidoc | 2 + .../condition-tokenfilter.asciidoc | 90 ++++++++++++++ modules/analysis-common/build.gradle | 9 ++ .../common/AnalysisPainlessExtension.java | 40 ++++++ .../common/AnalysisPredicateScript.java | 87 +++++++++++++ .../analysis/common/CommonAnalysisPlugin.java | 35 +++++- .../ScriptedConditionTokenFilterFactory.java | 117 ++++++++++++++++++ ...asticsearch.painless.spi.PainlessExtension | 1 + .../analysis/common/painless_whitelist.txt | 28 +++++ .../ScriptedConditionTokenFilterTests.java | 89 +++++++++++++ .../analysis-common/60_analysis_scripting.yml | 36 ++++++ 13 files changed, 578 insertions(+), 1 deletion(-) create mode 100644 docs/painless/painless-contexts/painless-analysis-predicate-context.asciidoc create mode 100644 docs/reference/analysis/tokenfilters/condition-tokenfilter.asciidoc create mode 100644 modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AnalysisPainlessExtension.java create mode 100644 modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AnalysisPredicateScript.java create mode 100644 modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java create mode 100644 modules/analysis-common/src/main/resources/META-INF/services/org.elasticsearch.painless.spi.PainlessExtension create mode 100644 modules/analysis-common/src/main/resources/org/elasticsearch/analysis/common/painless_whitelist.txt create mode 100644 modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterTests.java create mode 100644 modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/60_analysis_scripting.yml diff --git a/docs/painless/painless-contexts/index.asciidoc b/docs/painless/painless-contexts/index.asciidoc index a9d3982133e1b..a71fde0be32a0 100644 --- a/docs/painless/painless-contexts/index.asciidoc +++ b/docs/painless/painless-contexts/index.asciidoc @@ -30,6 +30,8 @@ include::painless-metric-agg-reduce-context.asciidoc[] include::painless-bucket-agg-context.asciidoc[] +include::painless-analysis-predicate-context.asciidoc[] + include::painless-watcher-condition-context.asciidoc[] include::painless-watcher-transform-context.asciidoc[] diff --git a/docs/painless/painless-contexts/painless-analysis-predicate-context.asciidoc b/docs/painless/painless-contexts/painless-analysis-predicate-context.asciidoc new file mode 100644 index 0000000000000..07914b671e781 --- /dev/null +++ b/docs/painless/painless-contexts/painless-analysis-predicate-context.asciidoc @@ -0,0 +1,43 @@ +[[painless-analysis-predicate-context]] +=== Analysis Predicate Context + +Use a painless script to determine whether or not the current token in an +analysis chain matches a predicate. + +*Variables* + +`params` (`Map`, read-only):: + User-defined parameters passed in as part of the query. 
+
+`token.term` (`CharSequence`, read-only)::
+        The characters of the current token
+
+`token.position` (`int`, read-only)::
+        The position of the current token
+
+`token.positionIncrement` (`int`, read-only)::
+        The position increment of the current token
+
+`token.positionLength` (`int`, read-only)::
+        The position length of the current token
+
+`token.startOffset` (`int`, read-only)::
+        The start offset of the current token
+
+`token.endOffset` (`int`, read-only)::
+        The end offset of the current token
+
+`token.type` (`String`, read-only)::
+        The type of the current token
+
+`token.keyword` (`boolean`, read-only)::
+        Whether or not the current token is marked as a keyword
+
+*Return*
+
+`boolean`::
+        Whether or not the current token matches the predicate
+
+*API*
+
+The standard <<painless-api-reference, Painless API>> is available.
\ No newline at end of file
diff --git a/docs/reference/analysis/tokenfilters.asciidoc b/docs/reference/analysis/tokenfilters.asciidoc
index ee891fdd09aa7..5899744247899 100644
--- a/docs/reference/analysis/tokenfilters.asciidoc
+++ b/docs/reference/analysis/tokenfilters.asciidoc
@@ -37,6 +37,8 @@ include::tokenfilters/word-delimiter-graph-tokenfilter.asciidoc[]
 
 include::tokenfilters/multiplexer-tokenfilter.asciidoc[]
 
+include::tokenfilters/condition-tokenfilter.asciidoc[]
+
 include::tokenfilters/stemmer-tokenfilter.asciidoc[]
 
 include::tokenfilters/stemmer-override-tokenfilter.asciidoc[]
diff --git a/docs/reference/analysis/tokenfilters/condition-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/condition-tokenfilter.asciidoc
new file mode 100644
index 0000000000000..cff05559ab9e6
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/condition-tokenfilter.asciidoc
@@ -0,0 +1,90 @@
+[[analysis-condition-tokenfilter]]
+=== Conditional Token Filter
+
+The conditional token filter takes a predicate script and a list of subfilters, and
+only applies the subfilters to the current token if it matches the predicate.
+
+[float]
+=== Options
+[horizontal]
+filter:: a chain of token filters to apply to the current token if the predicate
+    matches. These can be any token filters defined elsewhere in the index mappings.
+
+script:: a predicate script that determines whether or not the filters will be applied
+    to the current token. Note that only inline scripts are supported
+
+[float]
+=== Settings example
+
+You can set it up like:
+
+[source,js]
+--------------------------------------------------
+PUT /condition_example
+{
+    "settings" : {
+        "analysis" : {
+            "analyzer" : {
+                "my_analyzer" : {
+                    "tokenizer" : "standard",
+                    "filter" : [ "my_condition" ]
+                }
+            },
+            "filter" : {
+                "my_condition" : {
+                    "type" : "condition",
+                    "filter" : [ "lowercase" ],
+                    "script" : {
+                        "source" : "token.getTerm().length() < 5" <1>
+                    }
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------
+// CONSOLE
+
+<1> This will only apply the lowercase filter to terms that are less than 5
+characters in length
+
+And test it like:
+
+[source,js]
+--------------------------------------------------
+POST /condition_example/_analyze
+{
+  "analyzer" : "my_analyzer",
+  "text" : "What Flapdoodle"
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
+And it'd respond:
+
+[source,js]
+--------------------------------------------------
+{
+  "tokens": [
+    {
+      "token": "what", <1>
+      "start_offset": 0,
+      "end_offset": 4,
+      "type": "<ALPHANUM>",
+      "position": 0
+    },
+    {
+      "token": "Flapdoodle", <2>
+      "start_offset": 5,
+      "end_offset": 15,
+      "type": "<ALPHANUM>",
+      "position": 1
+    }
+  ]
+}
+--------------------------------------------------
+// TESTRESPONSE
+<1> The term `What` has been lowercased, because it is only 4 characters long
+<2> The term `Flapdoodle` has been left in its original case, because it doesn't pass
+    the predicate
\ No newline at end of file
diff --git a/modules/analysis-common/build.gradle b/modules/analysis-common/build.gradle
index 391b74934c97d..e5193ab3c8451 100644
--- a/modules/analysis-common/build.gradle
+++ b/modules/analysis-common/build.gradle
@@ -20,4 +20,13 @@ esplugin {
   description 'Adds "built in" analyzers to Elasticsearch.'
   classname 'org.elasticsearch.analysis.common.CommonAnalysisPlugin'
+  extendedPlugins = ['lang-painless']
+}
+
+dependencies {
+  compileOnly project(':modules:lang-painless')
+}
+
+integTestCluster {
+  module project(':modules:lang-painless')
 }
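Editorial aside, not part of the patch: the new `condition` filter is a thin wrapper around Lucene's `ConditionalTokenFilter` (new in Lucene 7.4), which routes each token through a wrapped filter chain only when `shouldFilter()` returns true. The self-contained sketch below shows that primitive in isolation, mirroring the `token.getTerm().length() < 5` example from the documentation above; the class name and `main` driver are illustrative assumptions, not code from the commit.

[source,java]
--------------------------------------------------
package org.elasticsearch.analysis.common;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Illustrative sketch: the Lucene primitive the new "condition" filter wraps.
// Tokens shorter than 5 characters are routed through LowerCaseFilter; everything
// else bypasses it untouched.
public class ConditionalLowercaseSketch {
    public static void main(String[] args) throws IOException {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("What Flapdoodle"));
        TokenStream stream = new ConditionalTokenFilter(tokenizer, LowerCaseFilter::new) {
            private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

            @Override
            protected boolean shouldFilter() {
                return termAtt.length() < 5; // same predicate as the docs example above
            }
        };
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term.toString()); // prints "what", then "Flapdoodle"
        }
        stream.close();
    }
}
--------------------------------------------------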
+ */ + +package org.elasticsearch.analysis.common; + +import org.elasticsearch.painless.spi.PainlessExtension; +import org.elasticsearch.painless.spi.Whitelist; +import org.elasticsearch.painless.spi.WhitelistLoader; +import org.elasticsearch.script.ScriptContext; + +import java.util.Collections; +import java.util.List; +import java.util.Map; + +public class AnalysisPainlessExtension implements PainlessExtension { + + private static final Whitelist WHITELIST = + WhitelistLoader.loadFromResourceFiles(AnalysisPainlessExtension.class, "painless_whitelist.txt"); + + @Override + public Map, List> getContextWhitelists() { + return Collections.singletonMap(AnalysisPredicateScript.CONTEXT, Collections.singletonList(WHITELIST)); + } +} diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AnalysisPredicateScript.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AnalysisPredicateScript.java new file mode 100644 index 0000000000000..7de588a958c77 --- /dev/null +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AnalysisPredicateScript.java @@ -0,0 +1,87 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.analysis.common; + +import org.elasticsearch.script.ScriptContext; + +/** + * A predicate based on the current token in a TokenStream + */ +public abstract class AnalysisPredicateScript { + + /** + * Encapsulation of the state of the current token + */ + public static class Token { + public CharSequence term; + public int pos; + public int posInc; + public int posLen; + public int startOffset; + public int endOffset; + public String type; + public boolean isKeyword; + + public CharSequence getTerm() { + return term; + } + + public int getPositionIncrement() { + return posInc; + } + + public int getPosition() { + return pos; + } + + public int getPositionLength() { + return posLen; + } + + public int getStartOffset() { + return startOffset; + } + + public int getEndOffset() { + return endOffset; + } + + public String getType() { + return type; + } + + public boolean isKeyword() { + return isKeyword; + } + } + + /** + * Returns {@code true} if the current term matches the predicate + */ + public abstract boolean execute(Token token); + + public interface Factory { + AnalysisPredicateScript newInstance(); + } + + public static final String[] PARAMETERS = new String[]{ "token" }; + public static final ScriptContext CONTEXT = new ScriptContext<>("analysis", Factory.class); + +} diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index d95af920a307b..bbd721169c6c7 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -111,9 +111,16 @@ import org.apache.lucene.analysis.tr.ApostropheFilter; import org.apache.lucene.analysis.tr.TurkishAnalyzer; import org.apache.lucene.analysis.util.ElisionFilter; +import org.apache.lucene.util.SetOnce; +import org.elasticsearch.client.Client; +import org.elasticsearch.cluster.service.ClusterService; +import org.elasticsearch.common.io.stream.NamedWriteableRegistry; import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.logging.Loggers; import org.elasticsearch.common.regex.Regex; +import org.elasticsearch.common.xcontent.NamedXContentRegistry; +import org.elasticsearch.env.Environment; +import org.elasticsearch.env.NodeEnvironment; import org.elasticsearch.index.analysis.AnalyzerProvider; import org.elasticsearch.index.analysis.CharFilterFactory; import org.elasticsearch.index.analysis.PreBuiltAnalyzerProviderFactory; @@ -127,20 +134,44 @@ import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy; import org.elasticsearch.plugins.AnalysisPlugin; import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.plugins.ScriptPlugin; +import org.elasticsearch.script.ScriptContext; +import org.elasticsearch.script.ScriptService; +import org.elasticsearch.threadpool.ThreadPool; +import org.elasticsearch.watcher.ResourceWatcherService; import org.tartarus.snowball.ext.DutchStemmer; import org.tartarus.snowball.ext.FrenchStemmer; import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.TreeMap; import static org.elasticsearch.plugins.AnalysisPlugin.requiresAnalysisSettings; -public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin { +public class 
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
index d95af920a307b..bbd721169c6c7 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
@@ -111,9 +111,16 @@ import org.apache.lucene.analysis.tr.ApostropheFilter;
 import org.apache.lucene.analysis.tr.TurkishAnalyzer;
 import org.apache.lucene.analysis.util.ElisionFilter;
+import org.apache.lucene.util.SetOnce;
+import org.elasticsearch.client.Client;
+import org.elasticsearch.cluster.service.ClusterService;
+import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
 import org.elasticsearch.common.logging.DeprecationLogger;
 import org.elasticsearch.common.logging.Loggers;
 import org.elasticsearch.common.regex.Regex;
+import org.elasticsearch.common.xcontent.NamedXContentRegistry;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.env.NodeEnvironment;
 import org.elasticsearch.index.analysis.AnalyzerProvider;
 import org.elasticsearch.index.analysis.CharFilterFactory;
 import org.elasticsearch.index.analysis.PreBuiltAnalyzerProviderFactory;
@@ -127,20 +134,44 @@ import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
 import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.plugins.Plugin;
+import org.elasticsearch.plugins.ScriptPlugin;
+import org.elasticsearch.script.ScriptContext;
+import org.elasticsearch.script.ScriptService;
+import org.elasticsearch.threadpool.ThreadPool;
+import org.elasticsearch.watcher.ResourceWatcherService;
 import org.tartarus.snowball.ext.DutchStemmer;
 import org.tartarus.snowball.ext.FrenchStemmer;
 
 import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
 import java.util.List;
 import java.util.Map;
 import java.util.TreeMap;
 
 import static org.elasticsearch.plugins.AnalysisPlugin.requiresAnalysisSettings;
 
-public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
+public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin, ScriptPlugin {
 
     private static final DeprecationLogger DEPRECATION_LOGGER =
             new DeprecationLogger(Loggers.getLogger(CommonAnalysisPlugin.class));
 
+    private final SetOnce<ScriptService> scriptService = new SetOnce<>();
+
+    @Override
+    public Collection<Object> createComponents(Client client, ClusterService clusterService, ThreadPool threadPool,
+                                               ResourceWatcherService resourceWatcherService, ScriptService scriptService,
+                                               NamedXContentRegistry xContentRegistry, Environment environment,
+                                               NodeEnvironment nodeEnvironment, NamedWriteableRegistry namedWriteableRegistry) {
+        this.scriptService.set(scriptService);
+        return Collections.emptyList();
+    }
+
+    @Override
+    @SuppressWarnings("rawtypes") // TODO ScriptPlugin needs to change this to pass precommit?
+    public List<ScriptContext> getContexts() {
+        return Collections.singletonList(AnalysisPredicateScript.CONTEXT);
+    }
+
     @Override
     public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
         Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> analyzers = new TreeMap<>();
@@ -202,6 +233,8 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
         filters.put("classic", ClassicFilterFactory::new);
         filters.put("czech_stem", CzechStemTokenFilterFactory::new);
         filters.put("common_grams", requiresAnalysisSettings(CommonGramsTokenFilterFactory::new));
+        filters.put("condition",
+            requiresAnalysisSettings((i, e, n, s) -> new ScriptedConditionTokenFilterFactory(i, n, s, scriptService.get())));
         filters.put("decimal_digit", DecimalDigitFilterFactory::new);
         filters.put("delimited_payload_filter", LegacyDelimitedPayloadTokenFilterFactory::new);
         filters.put("delimited_payload", DelimitedPayloadTokenFilterFactory::new);
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java
new file mode 100644
index 0000000000000..cf7fd5b047a89
--- /dev/null
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterFactory.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */ + +package org.elasticsearch.analysis.common; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.ReferringFilterFactory; +import org.elasticsearch.index.analysis.TokenFilterFactory; +import org.elasticsearch.script.Script; +import org.elasticsearch.script.ScriptService; +import org.elasticsearch.script.ScriptType; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.function.Function; + +/** + * A factory for a conditional token filter that only applies child filters if the underlying token + * matches an {@link AnalysisPredicateScript} + */ +public class ScriptedConditionTokenFilterFactory extends AbstractTokenFilterFactory implements ReferringFilterFactory { + + private final AnalysisPredicateScript.Factory factory; + private final List filters = new ArrayList<>(); + private final List filterNames; + + ScriptedConditionTokenFilterFactory(IndexSettings indexSettings, String name, + Settings settings, ScriptService scriptService) { + super(indexSettings, name, settings); + + Settings scriptSettings = settings.getAsSettings("script"); + Script script = Script.parse(scriptSettings); + if (script.getType() != ScriptType.INLINE) { + throw new IllegalArgumentException("Cannot use stored scripts in tokenfilter [" + name + "]"); + } + this.factory = scriptService.compile(script, AnalysisPredicateScript.CONTEXT); + + this.filterNames = settings.getAsList("filter"); + if (this.filterNames.isEmpty()) { + throw new IllegalArgumentException("Empty list of filters provided to tokenfilter [" + name + "]"); + } + } + + @Override + public TokenStream create(TokenStream tokenStream) { + Function filter = in -> { + for (TokenFilterFactory tff : filters) { + in = tff.create(in); + } + return in; + }; + AnalysisPredicateScript script = factory.newInstance(); + final AnalysisPredicateScript.Token token = new AnalysisPredicateScript.Token(); + return new ConditionalTokenFilter(tokenStream, filter) { + + CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); + PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class); + OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); + TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class); + + @Override + protected boolean shouldFilter() { + token.term = termAtt; + token.posInc = posIncAtt.getPositionIncrement(); + token.pos += token.posInc; + token.posLen = posLenAtt.getPositionLength(); + token.startOffset = offsetAtt.startOffset(); + token.endOffset = offsetAtt.endOffset(); + token.type = typeAtt.type(); + token.isKeyword = keywordAtt.isKeyword(); + return script.execute(token); + } + }; + } + + @Override + public void 
+    public void setReferences(Map<String, TokenFilterFactory> factories) {
+        for (String filter : filterNames) {
+            TokenFilterFactory tff = factories.get(filter);
+            if (tff == null) {
+                throw new IllegalArgumentException("ScriptedConditionTokenFilter [" + name() +
+                    "] refers to undefined token filter [" + filter + "]");
+            }
+            filters.add(tff);
+        }
+    }
+
+}
diff --git a/modules/analysis-common/src/main/resources/META-INF/services/org.elasticsearch.painless.spi.PainlessExtension b/modules/analysis-common/src/main/resources/META-INF/services/org.elasticsearch.painless.spi.PainlessExtension
new file mode 100644
index 0000000000000..44e98a3dd9c68
--- /dev/null
+++ b/modules/analysis-common/src/main/resources/META-INF/services/org.elasticsearch.painless.spi.PainlessExtension
@@ -0,0 +1 @@
+org.elasticsearch.analysis.common.AnalysisPainlessExtension
\ No newline at end of file
diff --git a/modules/analysis-common/src/main/resources/org/elasticsearch/analysis/common/painless_whitelist.txt b/modules/analysis-common/src/main/resources/org/elasticsearch/analysis/common/painless_whitelist.txt
new file mode 100644
index 0000000000000..83b70be58774e
--- /dev/null
+++ b/modules/analysis-common/src/main/resources/org/elasticsearch/analysis/common/painless_whitelist.txt
@@ -0,0 +1,28 @@
+#
+# Licensed to Elasticsearch under one or more contributor
+# license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright
+# ownership. Elasticsearch licenses this file to you under
+# the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+class org.elasticsearch.analysis.common.AnalysisPredicateScript$Token {
+  CharSequence getTerm()
+  int getPosition()
+  int getPositionIncrement()
+  int getPositionLength()
+  int getStartOffset()
+  int getEndOffset()
+  String getType()
+  boolean isKeyword()
+}
\ No newline at end of file
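Editorial aside, not part of the patch: the Java and YAML tests that follow lean on a detail of `shouldFilter()` above — the factory accumulates `token.position` itself (`token.pos += token.posInc`), starting from 0, so the first token of "Vorsprung Durch Technik" ends up at position 1 and fails the `token.position > 1` clause of the "condition-vars" predicate (its start offset of 0 fails `token.startOffset > 0` as well), which is why it alone keeps its original case. A tiny stand-alone trace of that arithmetic (class name is illustrative):

[source,java]
--------------------------------------------------
// Illustrative sketch: traces the position bookkeeping done in shouldFilter() above
// (token.pos += token.posInc, starting from 0), which explains the expected output
// of the "condition-vars" YAML test below.
public class PositionAccumulationTrace {
    public static void main(String[] args) {
        String[] terms = {"Vorsprung", "Durch", "Technik"};
        int position = 0;                        // Token.pos starts at 0
        for (String term : terms) {
            int positionIncrement = 1;           // the whitespace tokenizer emits posInc = 1 per token
            position += positionIncrement;       // mirrors token.pos += token.posInc
            boolean applyFilters = position > 1; // the "token.position > 1" clause of the test predicate
            System.out.println(term + ": position=" + position
                + (applyFilters ? " -> lowercased" : " -> left unchanged"));
        }
        // Vorsprung: position=1 -> left unchanged
        // Durch:     position=2 -> lowercased
        // Technik:   position=3 -> lowercased
    }
}
--------------------------------------------------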
diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterTests.java
new file mode 100644
index 0000000000000..39134ef1f532b
--- /dev/null
+++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/ScriptedConditionTokenFilterTests.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.analysis.common;
+
+import org.elasticsearch.Version;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.env.TestEnvironment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.IndexAnalyzers;
+import org.elasticsearch.index.analysis.NamedAnalyzer;
+import org.elasticsearch.indices.analysis.AnalysisModule;
+import org.elasticsearch.script.Script;
+import org.elasticsearch.script.ScriptContext;
+import org.elasticsearch.script.ScriptService;
+import org.elasticsearch.test.ESTokenStreamTestCase;
+import org.elasticsearch.test.IndexSettingsModule;
+
+import java.util.Collections;
+
+public class ScriptedConditionTokenFilterTests extends ESTokenStreamTestCase {
+
+    public void testSimpleCondition() throws Exception {
+        Settings settings = Settings.builder()
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .build();
+        Settings indexSettings = Settings.builder()
+            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
+            .put("index.analysis.filter.cond.type", "condition")
+            .put("index.analysis.filter.cond.script.source", "token.getTerm().length() > 5")
+            .putList("index.analysis.filter.cond.filter", "uppercase")
+            .put("index.analysis.analyzer.myAnalyzer.type", "custom")
+            .put("index.analysis.analyzer.myAnalyzer.tokenizer", "standard")
+            .putList("index.analysis.analyzer.myAnalyzer.filter", "cond")
+            .build();
+        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
+
+        AnalysisPredicateScript.Factory factory = () -> new AnalysisPredicateScript() {
+            @Override
+            public boolean execute(Token token) {
+                return token.getTerm().length() > 5;
+            }
+        };
+
+        @SuppressWarnings("unchecked")
+        ScriptService scriptService = new ScriptService(indexSettings, Collections.emptyMap(), Collections.emptyMap()){
+            @Override
+            public <FactoryType> FactoryType compile(Script script, ScriptContext<FactoryType> context) {
+                assertEquals(context, AnalysisPredicateScript.CONTEXT);
+                assertEquals(new Script("token.getTerm().length() > 5"), script);
+                return (FactoryType) factory;
+            }
+        };
+
+        CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();
+        plugin.createComponents(null, null, null, null, scriptService, null, null, null, null);
+        AnalysisModule module
+            = new AnalysisModule(TestEnvironment.newEnvironment(settings), Collections.singletonList(plugin));
+
+        IndexAnalyzers analyzers = module.getAnalysisRegistry().build(idxSettings);
+
+        try (NamedAnalyzer analyzer = analyzers.get("myAnalyzer")) {
+            assertNotNull(analyzer);
+            assertAnalyzesTo(analyzer, "Vorsprung Durch Technik", new String[]{
+                "VORSPRUNG", "Durch", "TECHNIK"
+            });
+        }
+
+    }
+
+}
diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/60_analysis_scripting.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/60_analysis_scripting.yml
new file mode 100644
index 0000000000000..4305e5db0af37
--- /dev/null
+++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/60_analysis_scripting.yml
@@ -0,0 +1,36 @@
+## Test analysis scripts
+
+"condition":
+  - do:
+      indices.analyze:
+        body:
+          text: "Vorsprung Durch Technik"
+          tokenizer: "whitespace"
+          filter:
+            - type: condition
+              filter: [ "lowercase" ]
+              script:
+                source:
"token.term.length() > 5" + + - length: { tokens: 3 } + - match: { tokens.0.token: "vorsprung" } + - match: { tokens.1.token: "Durch" } + - match: { tokens.2.token: "technik" } + +--- +"condition-vars": + - do: + indices.analyze: + body: + text: "Vorsprung Durch Technik" + tokenizer: "whitespace" + filter: + - type: condition + filter: [ "lowercase" ] + script: + source: "token.position > 1 && token.positionIncrement > 0 && token.startOffset > 0 && token.endOffset > 0 && (token.positionLength == 1 || token.type == \"a\" || token.keyword)" + + - length: { tokens: 3 } + - match: { tokens.0.token: "Vorsprung" } + - match: { tokens.1.token: "durch" } + - match: { tokens.2.token: "technik" }