Add an option to split keyword field on whitespace at query time #30691

Merged · 6 commits · Jun 1, 2018
Changes from 2 commits
6 changes: 6 additions & 0 deletions docs/reference/mapping/types/keyword.asciidoc
@@ -103,6 +103,12 @@ The following parameters are accepted by `keyword` fields:
How to pre-process the keyword prior to indexing. Defaults to `null`,
meaning the keyword is kept as-is.

+`split_queries_on_whitespace`::
+
+Whether <<full-text-queries,full text queries>> should split the input on whitespace
+when building a query for this field.
+Accepts `true` or `false` (default).

NOTE: Indexes imported from 2.x do not support `keyword`. Instead they will
attempt to downgrade `keyword` into `string`. This allows you to merge modern
mappings with legacy mappings. Long lived indexes will have to be recreated
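In effect, the new flag only changes how the query string is tokenized at search time. A standalone Lucene sketch of the difference (illustrative only: the stock KeywordAnalyzer and WhitespaceAnalyzer stand in for the field's real analysis chain, and the class and field names are invented):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class SplitQueriesDemo {
    // Runs the analyzer over the text and collects the emitted terms.
    static List<String> terms(Analyzer analyzer, String text) throws IOException {
        List<String> out = new ArrayList<>();
        try (TokenStream ts = analyzer.tokenStream("field", text)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                out.add(term.toString());
            }
            ts.end();
        }
        return out;
    }

    public static void main(String[] args) throws IOException {
        // split_queries_on_whitespace=false (default): the whole input is one term
        System.out.println(terms(new KeywordAnalyzer(), "New York"));    // [New York]
        // split_queries_on_whitespace=true: one term per whitespace-separated token
        System.out.println(terms(new WhitespaceAnalyzer(), "New York")); // [New, York]
    }
}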
server/src/main/java/org/elasticsearch/cluster/metadata/MetaDataIndexUpgradeService.java
@@ -186,7 +186,7 @@ public Set<Entry<String, NamedAnalyzer>> entrySet() {
return Collections.emptySet();
}
};
-try (IndexAnalyzers fakeIndexAnalzyers = new IndexAnalyzers(indexSettings, fakeDefault, fakeDefault, fakeDefault, analyzerMap, analyzerMap)) {
+try (IndexAnalyzers fakeIndexAnalzyers = new IndexAnalyzers(indexSettings, fakeDefault, fakeDefault, fakeDefault, analyzerMap, analyzerMap, analyzerMap)) {
MapperService mapperService = new MapperService(indexSettings, fakeIndexAnalzyers, xContentRegistry, similarityService,
mapperRegistry, () -> null);
mapperService.merge(indexMetaData, MapperService.MergeReason.MAPPING_RECOVERY);
server/src/main/java/org/elasticsearch/index/analysis/AnalysisRegistry.java
@@ -19,6 +19,7 @@
package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.core.internal.io.IOUtils;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.Version;
@@ -453,13 +454,16 @@ public IndexAnalyzers build(IndexSettings indexSettings,
analyzerProviders = new HashMap<>(analyzerProviders);
Map<String, NamedAnalyzer> analyzers = new HashMap<>();
Map<String, NamedAnalyzer> normalizers = new HashMap<>();
+Map<String, NamedAnalyzer> whitespaceNormalizers = new HashMap<>();
for (Map.Entry<String, AnalyzerProvider<?>> entry : analyzerProviders.entrySet()) {
processAnalyzerFactory(indexSettings, entry.getKey(), entry.getValue(), analyzers,
tokenFilterFactoryFactories, charFilterFactoryFactories, tokenizerFactoryFactories);
}
for (Map.Entry<String, AnalyzerProvider<?>> entry : normalizerProviders.entrySet()) {
processNormalizerFactory(entry.getKey(), entry.getValue(), normalizers,
-tokenizerFactoryFactories.get("keyword"), tokenFilterFactoryFactories, charFilterFactoryFactories);
+"keyword", tokenizerFactoryFactories.get("keyword"), tokenFilterFactoryFactories, charFilterFactoryFactories);
+processNormalizerFactory(entry.getKey(), entry.getValue(), whitespaceNormalizers,
+"whitespace", () -> new WhitespaceTokenizer(), tokenFilterFactoryFactories, charFilterFactoryFactories);
}

if (!analyzers.containsKey("default")) {
@@ -489,7 +493,7 @@ public IndexAnalyzers build(IndexSettings indexSettings,
}
}
return new IndexAnalyzers(indexSettings, defaultAnalyzer, defaultSearchAnalyzer, defaultSearchQuoteAnalyzer,
-unmodifiableMap(analyzers), unmodifiableMap(normalizers));
+unmodifiableMap(analyzers), unmodifiableMap(normalizers), unmodifiableMap(whitespaceNormalizers));
}

private void processAnalyzerFactory(IndexSettings indexSettings,
@@ -545,11 +549,12 @@ private void processNormalizerFactory(
String name,
AnalyzerProvider<?> normalizerFactory,
Map<String, NamedAnalyzer> normalizers,
-TokenizerFactory keywordTokenizerFactory,
+String tokenizerName,
+TokenizerFactory tokenizerFactory,
Map<String, TokenFilterFactory> tokenFilters,
Map<String, CharFilterFactory> charFilters) {
if (normalizerFactory instanceof CustomNormalizerProvider) {
-((CustomNormalizerProvider) normalizerFactory).build(keywordTokenizerFactory, charFilters, tokenFilters);
+((CustomNormalizerProvider) normalizerFactory).build(tokenizerName, tokenizerFactory, charFilters, tokenFilters);
}
Analyzer normalizerF = normalizerFactory.get();
if (normalizerF == null) {
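The net effect of the registry change: every configured normalizer is now built twice with the same char and token filter chain, once on the keyword tokenizer (index time and exact matching) and once on a whitespace tokenizer (used when `split_queries_on_whitespace` is enabled). A rough standalone sketch of that idea using Lucene's CustomAnalyzer builder, where the `lowercase` filter stands in for whatever filters a real normalizer configures:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import java.io.IOException;

public class NormalizerVariants {
    public static void main(String[] args) throws IOException {
        // Same filter chain, two tokenizers: this mirrors the normalizers /
        // whitespaceNormalizers maps built above.
        Analyzer keywordVariant = CustomAnalyzer.builder()
                .withTokenizer("keyword")        // index-time and exact-match form
                .addTokenFilter("lowercase")
                .build();
        Analyzer whitespaceVariant = CustomAnalyzer.builder()
                .withTokenizer("whitespace")     // query-time form when splitting
                .addTokenFilter("lowercase")
                .build();
        // Both normalize values the same way; they differ only in tokenization.
        System.out.println(keywordVariant.normalize("f", "Foo Bar").utf8ToString());    // foo bar
        System.out.println(whitespaceVariant.normalize("f", "Foo Bar").utf8ToString()); // foo bar
    }
}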
server/src/main/java/org/elasticsearch/index/analysis/CustomNormalizerProvider.java
@@ -38,15 +38,14 @@ public final class CustomNormalizerProvider extends AbstractIndexAnalyzerProvider<CustomAnalyzer> {
private CustomAnalyzer customAnalyzer;

public CustomNormalizerProvider(IndexSettings indexSettings,
String name, Settings settings) {
super(indexSettings, name, settings);
this.analyzerSettings = settings;
}

-public void build(final TokenizerFactory keywordTokenizerFactory, final Map<String, CharFilterFactory> charFilters,
+public void build(final String tokenizerName, final TokenizerFactory tokenizerFactory, final Map<String, CharFilterFactory> charFilters,
final Map<String, TokenFilterFactory> tokenFilters) {
-String tokenizerName = analyzerSettings.get("tokenizer");
-if (tokenizerName != null) {
+if (analyzerSettings.get("tokenizer") != null) {
throw new IllegalArgumentException("Custom normalizer [" + name() + "] cannot configure a tokenizer");
}

@@ -82,8 +81,8 @@ public void build(final TokenizerFactory keywordTokenizerFactory, final Map<String, CharFilterFactory> charFilters,
}

this.customAnalyzer = new CustomAnalyzer(
"keyword",
keywordTokenizerFactory,
tokenizerName,
tokenizerFactory,
charFiltersList.toArray(new CharFilterFactory[charFiltersList.size()]),
tokenFilterList.toArray(new TokenFilterFactory[tokenFilterList.size()])
);
server/src/main/java/org/elasticsearch/index/analysis/IndexAnalyzers.java
@@ -40,11 +40,12 @@ public final class IndexAnalyzers extends AbstractIndexComponent implements Closeable {
private final NamedAnalyzer defaultSearchQuoteAnalyzer;
private final Map<String, NamedAnalyzer> analyzers;
private final Map<String, NamedAnalyzer> normalizers;
+private final Map<String, NamedAnalyzer> whitespaceNormalizers;
private final IndexSettings indexSettings;

public IndexAnalyzers(IndexSettings indexSettings, NamedAnalyzer defaultIndexAnalyzer, NamedAnalyzer defaultSearchAnalyzer,
NamedAnalyzer defaultSearchQuoteAnalyzer, Map<String, NamedAnalyzer> analyzers,
-Map<String, NamedAnalyzer> normalizers) {
+Map<String, NamedAnalyzer> normalizers, Map<String, NamedAnalyzer> whitespaceNormalizers) {
super(indexSettings);
if (defaultIndexAnalyzer.name().equals("default") == false) {
throw new IllegalStateException("default analyzer must have the name [default] but was: [" + defaultIndexAnalyzer.name() + "]");
@@ -54,6 +55,7 @@ public IndexAnalyzers(IndexSettings indexSettings, NamedAnalyzer defaultIndexAnalyzer,
this.defaultSearchQuoteAnalyzer = defaultSearchQuoteAnalyzer;
this.analyzers = analyzers;
this.normalizers = normalizers;
+this.whitespaceNormalizers = whitespaceNormalizers;
this.indexSettings = indexSettings;
}

@@ -71,6 +73,13 @@ public NamedAnalyzer getNormalizer(String name) {
return normalizers.get(name);
}

+/**
+ * Returns a normalizer that splits on whitespace mapped to the given name or <code>null</code> if not present
+ */
+public NamedAnalyzer getWhitespaceNormalizer(String name) {
+return whitespaceNormalizers.get(name);
+}

/**
* Returns the default index analyzer for this index
*/
server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java
@@ -20,6 +20,7 @@
package org.elasticsearch.index.mapper;

import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedSetDocValuesField;
@@ -35,6 +36,8 @@
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
+import org.elasticsearch.index.analysis.AnalyzerScope;
+import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.plain.DocValuesIndexFieldData;
@@ -73,6 +76,8 @@ public static class Builder extends FieldMapper.Builder<Builder, KeywordFieldMapper> {

protected String nullValue = Defaults.NULL_VALUE;
protected int ignoreAbove = Defaults.IGNORE_ABOVE;
+private IndexAnalyzers indexAnalyzers;
+private String normalizerName;

public Builder(String name) {
super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE);
@@ -106,15 +111,36 @@ public Builder eagerGlobalOrdinals(boolean eagerGlobalOrdinals) {
return builder;
}

-public Builder normalizer(NamedAnalyzer normalizer) {
-fieldType().setNormalizer(normalizer);
-fieldType().setSearchAnalyzer(normalizer);
+public Builder splitQueriesOnWhitespace(boolean splitQueriesOnWhitespace) {
+fieldType().setSplitQueriesOnWhitespace(splitQueriesOnWhitespace);
return builder;
}

+public Builder normalizer(IndexAnalyzers indexAnalyzers, String name) {
+this.indexAnalyzers = indexAnalyzers;
+this.normalizerName = name;
+return builder;
+}

@Override
public KeywordFieldMapper build(BuilderContext context) {
setupFieldType(context);
+if (normalizerName != null) {
+NamedAnalyzer normalizer = indexAnalyzers.getNormalizer(normalizerName);
+if (normalizer == null) {
+throw new MapperParsingException("normalizer [" + normalizerName + "] not found for field [" + name + "]");
+}
+fieldType().setNormalizer(normalizer);
+final NamedAnalyzer searchAnalyzer;
+if (fieldType().splitQueriesOnWhitespace) {
+searchAnalyzer = indexAnalyzers.getWhitespaceNormalizer(normalizerName);
+} else {
+searchAnalyzer = normalizer;
+}
+fieldType().setSearchAnalyzer(searchAnalyzer);
+} else if (fieldType().splitQueriesOnWhitespace) {
+fieldType().setSearchAnalyzer(new NamedAnalyzer("whitespace", AnalyzerScope.INDEX, new WhitespaceAnalyzer()));
+}
return new KeywordFieldMapper(
name, fieldType, defaultFieldType, ignoreAbove,
context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo);
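Condensed, the analyzer selection in build() above has four cases. A hypothetical helper, not part of this PR (error handling omitted; null stands for "keep the default keyword search analyzer"), that mirrors the logic:

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.elasticsearch.index.analysis.AnalyzerScope;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.NamedAnalyzer;

class SearchAnalyzerChoice {
    static NamedAnalyzer searchAnalyzerFor(IndexAnalyzers analyzers, String normalizerName,
                                           boolean splitQueriesOnWhitespace) {
        if (normalizerName != null) {
            // a normalizer is configured: splitting swaps in the whitespace-tokenized
            // variant of the same filter chain built by AnalysisRegistry
            return splitQueriesOnWhitespace
                    ? analyzers.getWhitespaceNormalizer(normalizerName)
                    : analyzers.getNormalizer(normalizerName);
        }
        // no normalizer: plain whitespace analysis when splitting, otherwise
        // leave the field type's default keyword search analyzer in place
        return splitQueriesOnWhitespace
                ? new NamedAnalyzer("whitespace", AnalyzerScope.INDEX, new WhitespaceAnalyzer())
                : null;
    }
}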
@@ -147,13 +173,12 @@ public Mapper.Builder<?,?> parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
iterator.remove();
} else if (propName.equals("normalizer")) {
if (propNode != null) {
-NamedAnalyzer normalizer = parserContext.getIndexAnalyzers().getNormalizer(propNode.toString());
-if (normalizer == null) {
-throw new MapperParsingException("normalizer [" + propNode.toString() + "] not found for field [" + name + "]");
-}
-builder.normalizer(normalizer);
+builder.normalizer(parserContext.getIndexAnalyzers(), propNode.toString());
}
iterator.remove();
} else if (propName.equals("split_queries_on_whitespace")) {
builder.splitQueriesOnWhitespace(XContentMapValues.nodeBooleanValue(propNode, "split_queries_on_whitespace"));
iterator.remove();
}
}
return builder;
@@ -163,6 +188,7 @@ public Mapper.Builder<?,?> parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
public static final class KeywordFieldType extends StringFieldType {

private NamedAnalyzer normalizer = null;
+private boolean splitQueriesOnWhitespace;

public KeywordFieldType() {
setIndexAnalyzer(Lucene.KEYWORD_ANALYZER);
@@ -172,6 +198,7 @@ public KeywordFieldType() {
protected KeywordFieldType(KeywordFieldType ref) {
super(ref);
this.normalizer = ref.normalizer;
+this.splitQueriesOnWhitespace = ref.splitQueriesOnWhitespace;
}

public KeywordFieldType clone() {
@@ -183,7 +210,9 @@ public boolean equals(Object o) {
if (super.equals(o) == false) {
return false;
}
-return Objects.equals(normalizer, ((KeywordFieldType) o).normalizer);
+KeywordFieldType other = (KeywordFieldType) o;
+return Objects.equals(normalizer, other.normalizer) &&
+splitQueriesOnWhitespace == other.splitQueriesOnWhitespace;
}

@Override
@@ -197,7 +226,7 @@ public void checkCompatibility(MappedFieldType otherFT, List<String> conflicts)

@Override
public int hashCode() {
-return 31 * super.hashCode() + Objects.hashCode(normalizer);
+return 31 * super.hashCode() + Objects.hash(normalizer, splitQueriesOnWhitespace);
}

@Override
@@ -214,6 +243,15 @@ public void setNormalizer(NamedAnalyzer normalizer) {
this.normalizer = normalizer;
}

+public boolean splitQueriesOnWhitespace() {
+return splitQueriesOnWhitespace;
+}
+
+public void setSplitQueriesOnWhitespace(boolean splitQueriesOnWhitespace) {
+checkIfFrozen();
+this.splitQueriesOnWhitespace = splitQueriesOnWhitespace;
+}

@Override
public Query existsQuery(QueryShardContext context) {
if (hasDocValues()) {
@@ -271,7 +309,8 @@ protected BytesRef indexedValueForSearch(Object value) {
private int ignoreAbove;

protected KeywordFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType,
-int ignoreAbove, Settings indexSettings, MultiFields multiFields, CopyTo copyTo) {
+int ignoreAbove, Settings indexSettings,
+MultiFields multiFields, CopyTo copyTo) {
super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo);
assert fieldType.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) <= 0;
this.ignoreAbove = ignoreAbove;
@@ -374,5 +413,9 @@ protected void doXContentBody(XContentBuilder builder, boolean includeDefaults,
} else if (includeDefaults) {
builder.nullField("normalizer");
}

+if (includeDefaults || fieldType().splitQueriesOnWhitespace) {
+builder.field("split_queries_on_whitespace", fieldType().splitQueriesOnWhitespace);
+}
}
}
server/src/main/java/org/elasticsearch/index/search/MatchQuery.java
@@ -53,6 +53,7 @@
import org.elasticsearch.common.lucene.search.Queries;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
+import org.elasticsearch.index.mapper.KeywordFieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.index.query.support.QueryParsers;
@@ -263,7 +264,8 @@ public Query parse(Type type, String fieldName, Object value) throws IOException
* passing through QueryBuilder.
*/
boolean noForcedAnalyzer = this.analyzer == null;
-if (fieldType.tokenized() == false && noForcedAnalyzer) {
+if (fieldType.tokenized() == false && noForcedAnalyzer &&
+fieldType instanceof KeywordFieldMapper.KeywordFieldType == false) {
return blendTermQuery(new Term(fieldName, value.toString()), fieldType);
}

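Keyword fields are untokenized, so they previously took the blendTermQuery shortcut and their search analyzer never ran; excluding them here lets the (possibly whitespace-splitting) analyzer build the query. A standalone Lucene sketch of what the analyzed path produces when splitting is enabled (hypothetical field name and query text):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import java.io.IOException;

public class AnalyzedMatchSketch {
    public static void main(String[] args) throws IOException {
        // With splitting enabled, "foo bar" analyzes to two terms, so the match
        // query becomes a boolean OR of term queries instead of one raw term query.
        Analyzer searchAnalyzer = new WhitespaceAnalyzer();
        BooleanQuery.Builder bq = new BooleanQuery.Builder();
        try (TokenStream ts = searchAnalyzer.tokenStream("my_keyword", "foo bar")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                bq.add(new TermQuery(new Term("my_keyword", term.toString())),
                        BooleanClause.Occur.SHOULD);
            }
            ts.end();
        }
        Query query = bq.build();
        System.out.println(query); // my_keyword:foo my_keyword:bar
    }
}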
server/src/test/java/org/elasticsearch/index/analysis/CustomNormalizerTests.java
@@ -52,6 +52,12 @@ public void testBasics() throws IOException {
assertEquals("my_normalizer", normalizer.name());
assertTokenStreamContents(normalizer.tokenStream("foo", "Cet été-là"), new String[] {"cet été-là"});
assertEquals(new BytesRef("cet été-là"), normalizer.normalize("foo", "Cet été-là"));

+normalizer = analysis.indexAnalyzers.getWhitespaceNormalizer("my_normalizer");
+assertNotNull(normalizer);
+assertEquals("my_normalizer", normalizer.name());
+assertTokenStreamContents(normalizer.tokenStream("foo", "Cet été-là"), new String[] {"cet", "été-là"});
+assertEquals(new BytesRef("cet été-là"), normalizer.normalize("foo", "Cet été-là"));
}

public void testUnknownType() {
@@ -86,7 +92,13 @@ public void testCharFilters() throws IOException {
NamedAnalyzer normalizer = analysis.indexAnalyzers.getNormalizer("my_normalizer");
assertNotNull(normalizer);
assertEquals("my_normalizer", normalizer.name());
assertTokenStreamContents(normalizer.tokenStream("foo", "abc"), new String[] {"zbc"});
assertTokenStreamContents(normalizer.tokenStream("foo", "abc acd"), new String[] {"zbc zcd"});
assertEquals(new BytesRef("zbc"), normalizer.normalize("foo", "abc"));

+normalizer = analysis.indexAnalyzers.getWhitespaceNormalizer("my_normalizer");
+assertNotNull(normalizer);
+assertEquals("my_normalizer", normalizer.name());
+assertTokenStreamContents(normalizer.tokenStream("foo", "abc acd"), new String[] {"zbc", "zcd"});
+assertEquals(new BytesRef("zbc"), normalizer.normalize("foo", "abc"));
}
