Commit
Add conditional token filter to elasticsearch (#31958)
This allows tokenfilters to be applied selectively, depending on the status of the current token in the tokenstream. The filter takes a scripted predicate, and only applies its subfilter when the predicate returns true.
1 parent aff5658
commit 81014e0
Showing 13 changed files with 578 additions and 1 deletion.
43 changes: 43 additions & 0 deletions
docs/painless/painless-contexts/painless-analysis-predicate-context.asciidoc
[[painless-analysis-predicate-context]]
=== Analysis Predicate Context

Use a painless script to determine whether or not the current token in an
analysis chain matches a predicate.

*Variables*

`params` (`Map`, read-only)::
User-defined parameters passed in as part of the query.

`token.term` (`CharSequence`, read-only)::
The characters of the current token

`token.position` (`int`, read-only)::
The position of the current token

`token.positionIncrement` (`int`, read-only)::
The position increment of the current token

`token.positionLength` (`int`, read-only)::
The position length of the current token

`token.startOffset` (`int`, read-only)::
The start offset of the current token

`token.endOffset` (`int`, read-only)::
The end offset of the current token

`token.type` (`String`, read-only)::
The type of the current token

`token.keyword` (`boolean`, read-only)::
Whether or not the current token is marked as a keyword

*Return*

`boolean`::
Whether or not the current token matches the predicate

*API*

The standard <<painless-api-reference, Painless API>> is available.
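The variables above correspond to the getters on the `Token` class that this commit adds in `AnalysisPredicateScript` (shown further down in this diff). As a rough sketch only, and not part of the commit, a predicate such as `token.term.length() < 5 && !token.keyword` behaves like the following hand-written subclass:

import org.elasticsearch.analysis.common.AnalysisPredicateScript;

// Illustrative only: a hand-written predicate equivalent to the script
// "token.term.length() < 5 && !token.keyword". In practice Painless compiles
// the script source into the concrete class; this sketch just shows how the
// documented variables map onto the Token accessors.
public class ShortNonKeywordPredicate extends AnalysisPredicateScript {
    @Override
    public boolean execute(Token token) {
        return token.getTerm().length() < 5 && token.isKeyword() == false;
    }
}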
90 changes: 90 additions & 0 deletions
docs/reference/analysis/tokenfilters/condition-tokenfilter.asciidoc
[[analysis-condition-tokenfilter]]
=== Conditional Token Filter

The conditional token filter takes a predicate script and a list of subfilters, and
only applies the subfilters to the current token if it matches the predicate.

[float]
=== Options
[horizontal]
filter:: a chain of token filters to apply to the current token if the predicate
matches. These can be any token filters defined elsewhere in the index mappings.

script:: a predicate script that determines whether or not the filters will be applied
to the current token. Note that only inline scripts are supported.

[float]
=== Settings example

You can set it up like:

[source,js]
--------------------------------------------------
PUT /condition_example
{
    "settings" : {
        "analysis" : {
            "analyzer" : {
                "my_analyzer" : {
                    "tokenizer" : "standard",
                    "filter" : [ "my_condition" ]
                }
            },
            "filter" : {
                "my_condition" : {
                    "type" : "condition",
                    "filter" : [ "lowercase" ],
                    "script" : {
                        "source" : "token.getTerm().length() < 5"  <1>
                    }
                }
            }
        }
    }
}
--------------------------------------------------
// CONSOLE

<1> This will only apply the lowercase filter to terms that are less than 5
characters in length.

And test it like:

[source,js]
--------------------------------------------------
POST /condition_example/_analyze
{
  "analyzer" : "my_analyzer",
  "text" : "What Flapdoodle"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]

And it'd respond:

[source,js]
--------------------------------------------------
{
  "tokens": [
    {
      "token": "what",          <1>
      "start_offset": 0,
      "end_offset": 4,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "Flapdoodle",    <2>
      "start_offset": 5,
      "end_offset": 15,
      "type": "<ALPHANUM>",
      "position": 1
    }
  ]
}
--------------------------------------------------
// TESTRESPONSE

<1> The term `What` has been lowercased, because it is only 4 characters long.
<2> The term `Flapdoodle` has been left in its original case, because it doesn't pass
the predicate.
40 changes: 40 additions & 0 deletions
...sis-common/src/main/java/org/elasticsearch/analysis/common/AnalysisPainlessExtension.java
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.analysis.common;

import org.elasticsearch.painless.spi.PainlessExtension;
import org.elasticsearch.painless.spi.Whitelist;
import org.elasticsearch.painless.spi.WhitelistLoader;
import org.elasticsearch.script.ScriptContext;

import java.util.Collections;
import java.util.List;
import java.util.Map;

public class AnalysisPainlessExtension implements PainlessExtension {

    private static final Whitelist WHITELIST =
        WhitelistLoader.loadFromResourceFiles(AnalysisPainlessExtension.class, "painless_whitelist.txt");

    @Override
    public Map<ScriptContext<?>, List<Whitelist>> getContextWhitelists() {
        return Collections.singletonMap(AnalysisPredicateScript.CONTEXT, Collections.singletonList(WHITELIST));
    }
}
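Painless extensions like this one are picked up through Java's standard service-provider (SPI) mechanism, and the map returned here attaches the whitelist to the `analysis` script context defined in `AnalysisPredicateScript` below. As a sketch only, assuming the extension is registered on the classpath as a service, the contributed context whitelists could be enumerated like this:

import org.elasticsearch.painless.spi.PainlessExtension;
import org.elasticsearch.painless.spi.Whitelist;
import org.elasticsearch.script.ScriptContext;

import java.util.List;
import java.util.Map;
import java.util.ServiceLoader;

public class ListContextWhitelists {
    public static void main(String[] args) {
        // Load every PainlessExtension registered via META-INF/services and
        // print which script contexts each one contributes whitelists to.
        for (PainlessExtension extension : ServiceLoader.load(PainlessExtension.class)) {
            for (Map.Entry<ScriptContext<?>, List<Whitelist>> entry : extension.getContextWhitelists().entrySet()) {
                System.out.println(entry.getKey().name + " -> " + entry.getValue().size() + " whitelist(s)");
            }
        }
    }
}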
87 changes: 87 additions & 0 deletions
...lysis-common/src/main/java/org/elasticsearch/analysis/common/AnalysisPredicateScript.java
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.analysis.common;

import org.elasticsearch.script.ScriptContext;

/**
 * A predicate based on the current token in a TokenStream
 */
public abstract class AnalysisPredicateScript {

    /**
     * Encapsulation of the state of the current token
     */
    public static class Token {
        public CharSequence term;
        public int pos;
        public int posInc;
        public int posLen;
        public int startOffset;
        public int endOffset;
        public String type;
        public boolean isKeyword;

        public CharSequence getTerm() {
            return term;
        }

        public int getPositionIncrement() {
            return posInc;
        }

        public int getPosition() {
            return pos;
        }

        public int getPositionLength() {
            return posLen;
        }

        public int getStartOffset() {
            return startOffset;
        }

        public int getEndOffset() {
            return endOffset;
        }

        public String getType() {
            return type;
        }

        public boolean isKeyword() {
            return isKeyword;
        }
    }

    /**
     * Returns {@code true} if the current term matches the predicate
     */
    public abstract boolean execute(Token token);

    public interface Factory {
        AnalysisPredicateScript newInstance();
    }

    public static final String[] PARAMETERS = new String[]{ "token" };
    public static final ScriptContext<Factory> CONTEXT = new ScriptContext<>("analysis", Factory.class);

}
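For illustration only, not part of this commit, the `Factory`/`Token` contract above can be exercised from plain Java. In production the factory is produced by compiling a Painless script against the `analysis` context; a hand-rolled instance shows the same data flow:

import org.elasticsearch.analysis.common.AnalysisPredicateScript;

public class AnalysisPredicateScriptDemo {
    public static void main(String[] args) {
        // A hand-rolled factory standing in for a compiled script:
        // accept tokens longer than three characters.
        AnalysisPredicateScript.Factory factory = () -> new AnalysisPredicateScript() {
            @Override
            public boolean execute(Token token) {
                return token.getTerm().length() > 3;
            }
        };
        AnalysisPredicateScript predicate = factory.newInstance();

        // Fill the mutable Token with the state of one stream position,
        // mirroring the second token of the "What Flapdoodle" example above.
        AnalysisPredicateScript.Token token = new AnalysisPredicateScript.Token();
        token.term = "Flapdoodle";
        token.pos = 1;
        token.posInc = 1;
        token.posLen = 1;
        token.startOffset = 5;
        token.endOffset = 15;
        token.type = "<ALPHANUM>";
        token.isKeyword = false;

        System.out.println(predicate.execute(token)); // prints "true"
    }
}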