Skip to content

Commit

Permalink
Add limits for ngram and shingle settings
Browse files Browse the repository at this point in the history
Create index-level settings:
max_ngram_diff - maximum allowed difference between max_gram and min_gram in
NGramTokenFilter/NGramTokenizer. Default is 1.
max_shingle_diff - maximum allowed difference betweenmax_shingle_size and
 min_shingle_size in ShingleTokenFilter.  Default is 3.

Throw an IllegalArgumentException from v7.X when
trying to create NGramTokenFilter, NGramTokenizer, ShingleTokenFilter
where difference between max_size and min_size exceeds the settings value.
Create a deprecated warning for versions < 7.X for this case.

Closes #25887
  • Loading branch information
mayya-sharipova committed Nov 3, 2017
1 parent 0234287 commit e26f6f1
Show file tree
Hide file tree
Showing 10 changed files with 59 additions and 27 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.elasticsearch.Version;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
Expand Down Expand Up @@ -89,10 +90,15 @@ public NGramTokenizerFactory(IndexSettings indexSettings, Environment environmen
this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
int ngramDiff = maxGram - minGram;
if (ngramDiff > maxAllowedNgramDiff) {
throw new IllegalArgumentException(
"The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: [" + maxAllowedNgramDiff
+ "] but was [" + ngramDiff + "]. This limit can be set by changing the ["
+ IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.");
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) {
throw new IllegalArgumentException(
"The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: ["
+ maxAllowedNgramDiff + "] but was [" + ngramDiff + "]. This limit can be set by changing the ["
+ IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.");
} else {
deprecationLogger.deprecated("Deprecated big difference between max_gram and min_gram in NGram Tokenizer,"
+ "expected difference must be less than or equal to: [" + maxAllowedNgramDiff + "]");
}
}
this.matcher = parseTokenChars(settings.getAsList("token_chars"));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.elasticsearch.Version;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
Expand All @@ -35,14 +36,21 @@ public ShingleTokenFilterFactory(IndexSettings indexSettings, Environment enviro
int maxAllowedShingleDiff = indexSettings.getMaxShingleDiff();
Integer maxShingleSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
Integer minShingleSize = settings.getAsInt("min_shingle_size", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
int shingleDiff = maxShingleSize - minShingleSize;
Boolean outputUnigrams = settings.getAsBoolean("output_unigrams", true);

int shingleDiff = maxShingleSize - minShingleSize + (outputUnigrams ? 1 : 0);
if (shingleDiff > maxAllowedShingleDiff) {
throw new IllegalArgumentException(
"The difference between max_shingle_size and min_shingle_size in Shingle Token Filter must be less than or equal to: ["
+ maxAllowedShingleDiff + "] but was [" + shingleDiff + "]. This limit can be set by changing the ["
+ IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey() + "] index level setting.");
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) {
throw new IllegalArgumentException(
"In Shingle TokenFilter the difference between max_shingle_size and min_shingle_size (and +1 if outputting unigrams)"
+ " must be less than or equal to: [" + maxAllowedShingleDiff + "] but was [" + shingleDiff + "]. This limit"
+ " can be set by changing the [" + IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey() + "] index level setting.");
} else {
deprecationLogger.deprecated("Deprecated big difference between maxShingleSize and minShingleSize in Shingle TokenFilter,"
+ "expected difference must be less than or equal to: [" + maxAllowedShingleDiff + "]");
}
}
Boolean outputUnigrams = settings.getAsBoolean("output_unigrams", true);

Boolean outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);
String tokenSeparator = settings.get("token_separator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
String fillerToken = settings.get("filler_token", ShingleFilter.DEFAULT_FILLER_TOKEN);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,9 @@ public void testMaxShingleDiffException() throws Exception{
fail();
} catch (IllegalArgumentException ex) {
assertEquals(
"The difference between max_shingle_size and min_shingle_size in Shingle Token Filter must be less than or equal to: ["
+ maxAllowedShingleDiff + "] but was [" + shingleDiff + "]. This limit can be set by changing the ["
+ IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey() + "] index level setting.",
"In Shingle TokenFilter the difference between max_shingle_size and min_shingle_size (and +1 if outputting unigrams)"
+ " must be less than or equal to: [" + maxAllowedShingleDiff + "] but was [" + shingleDiff + "]. This limit"
+ " can be set by changing the [" + IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey() + "] index level setting.",
ex.getMessage());
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.plugins.ScriptPlugin;
import org.elasticsearch.script.ScriptContext;
Expand Down Expand Up @@ -683,6 +684,7 @@ public void testDifferentShardSize() throws Exception {
public void testShardFailures() throws IOException, InterruptedException {
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
.put(indexSettings())
.put(IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey(), 4)
.put("index.analysis.analyzer.suggest.tokenizer", "standard")
.putList("index.analysis.analyzer.suggest.filter", "standard", "lowercase", "shingler")
.put("index.analysis.filter.shingler.type", "shingle")
Expand Down Expand Up @@ -743,6 +745,7 @@ public void testEmptyShards() throws IOException, InterruptedException {
endObject();
assertAcked(prepareCreate("test").setSettings(Settings.builder()
.put(indexSettings())
.put(IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey(), 4)
.put("index.analysis.analyzer.suggest.tokenizer", "standard")
.putList("index.analysis.analyzer.suggest.filter", "standard", "lowercase", "shingler")
.put("index.analysis.filter.shingler.type", "shingle")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,6 @@ type:
|`max_gram` |Defaults to `2`.
|============================

The index level setting `index.max_ngram_diff` controls the maximum allowed
difference between `max_gram` and `min_gram`.

Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,5 @@ used if the position increment is greater than one when a `stop` filter is used
together with the `shingle` filter. Defaults to `"_"`
|=======================================================================

The index level setting `index.max_shingle_diff` controls the maximum allowed
difference between `max_shingle_size` and `min_shingle_size`.
3 changes: 3 additions & 0 deletions docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,9 @@ value. The smaller the length, the more documents will match but the lower
the quality of the matches. The longer the length, the more specific the
matches. A tri-gram (length `3`) is a good place to start.

The index level setting `index.max_ngram_diff` controls the maximum allowed
difference between `max_gram` and `min_gram`.

[float]
=== Example configuration

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.Version;



public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {
Expand All @@ -41,10 +43,15 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {
this.maxGram = settings.getAsInt("max_gram", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE);
int ngramDiff = maxGram - minGram;
if (ngramDiff > maxAllowedNgramDiff) {
throw new IllegalArgumentException(
"The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: [" + maxAllowedNgramDiff
+ "] but was [" + ngramDiff + "]. This limit can be set by changing the ["
+ IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.");
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) {
throw new IllegalArgumentException(
"The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: ["
+ maxAllowedNgramDiff + "] but was [" + ngramDiff + "]. This limit can be set by changing the ["
+ IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.");
} else {
deprecationLogger.deprecated("Deprecated big difference between max_gram and min_gram in NGram Tokenizer,"
+ "expected difference must be less than or equal to: [" + maxAllowedNgramDiff + "]");
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -169,16 +169,13 @@ public void testMaxNGramDiffException() throws Exception{
int max_gram = min_gram + ngramDiff;

final Settings settings = newAnalysisSettingsBuilder().put("min_gram", min_gram).put("max_gram", max_gram).build();
try {
new NGramTokenizerFactory(indexProperties, null, name, settings).create();
fail();
} catch (IllegalArgumentException ex) {
assertEquals(
"The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: [" + maxAllowedNgramDiff
+ "] but was [" + ngramDiff + "]. This limit can be set by changing the ["
+ IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.",
ex.getMessage());
}
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () ->
new NGramTokenizerFactory(indexProperties, null, name, settings).create());
assertEquals(
"The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: ["
+ maxAllowedNgramDiff + "] but was [" + ngramDiff + "]. This limit can be set by changing the ["
+ IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.",
ex.getMessage());
}

private Version randomVersion(Random random) throws IllegalArgumentException, IllegalAccessException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@

---
"nGram_exception":
- skip:
version: " - 6.99.99"
reason: only starting from version 7.x this throws an error
- do:
catch: /The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to[:] \[1\] but was \[2\]\. This limit can be set by changing the \[index.max_ngram_diff\] index level setting\./
indices.analyze:
Expand Down

0 comments on commit e26f6f1

Please sign in to comment.