Skip to content

Commit

Permalink
Merge pull request #1268 from b2ihealthcare/issue/SO-5916-syonym-term…
Browse files Browse the repository at this point in the history
…-search

SO-5916: Search term with ES synonym does not result in a match
  • Loading branch information
cmark authored Mar 20, 2024
2 parents 8333f0a + 07203c8 commit 802b50d
Show file tree
Hide file tree
Showing 9 changed files with 207 additions and 69 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@
<setEntry value="org.apache.commons.commons-codec@default:default"/>
<setEntry value="org.apache.commons.commons-collections4@default:default"/>
<setEntry value="org.apache.commons.commons-compress@default:default"/>
<setEntry value="org.apache.commons.commons-io@default:default"/>
<setEntry value="org.apache.commons.lang3@default:default"/>
<setEntry value="org.apache.commons.logging@default:default"/>
<setEntry value="org.apache.httpcomponents.httpasyncclient@default:default"/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@
<setEntry value="org.apache.commons.commons-codec@default:default"/>
<setEntry value="org.apache.commons.commons-collections4@default:default"/>
<setEntry value="org.apache.commons.commons-compress@default:default"/>
<setEntry value="org.apache.commons.commons-io@default:default"/>
<setEntry value="org.apache.commons.lang3@default:default"/>
<setEntry value="org.apache.commons.logging@default:default"/>
<setEntry value="org.apache.httpcomponents.httpasyncclient@default:default"/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,49 @@ public boolean equals(Object obj) {

}

/**
 * Test fixture document with an identifier and a single {@code text} property.
 * <p>
 * The {@code text} property declares a {@code tokenized} TEXT alias configured with
 * {@code analyzer = Analyzers.TOKENIZED} and {@code searchAnalyzer = Analyzers.TOKENIZED_SYNONYMS},
 * i.e. the alias names a search analyzer that differs from its regular analyzer.
 */
@Doc
private static final class DataWithTokenizedTextSearchAnalyzer {

    @ID
    private String id;

    @Field(
        aliases = {
            @FieldAlias(name = "tokenized", type = FieldAliasType.TEXT, analyzer = Analyzers.TOKENIZED, searchAnalyzer = Analyzers.TOKENIZED_SYNONYMS)
        }
    )
    private String text;

    /**
     * Creates a fixture document; the parameters are bound to the {@code id} and {@code text} JSON properties.
     *
     * @param id the document identifier
     * @param text the text value exposed through the {@code tokenized} alias
     */
    @JsonCreator
    public DataWithTokenizedTextSearchAnalyzer(@JsonProperty("id") String id, @JsonProperty("text") String text) {
        this.id = id;
        this.text = text;
    }

    /**
     * @return the text value of this document
     */
    public String getText() {
        return this.text;
    }

    @Override
    public int hashCode() {
        return Objects.hash(this.id, this.text);
    }

    /**
     * Two fixtures are equal when they are of the exact same class and both {@code id} and {@code text} are equal.
     */
    @Override
    public boolean equals(Object obj) {
        if (obj == this) {
            return true;
        }
        if (obj == null || obj.getClass() != getClass()) {
            return false;
        }
        final DataWithTokenizedTextSearchAnalyzer that = (DataWithTokenizedTextSearchAnalyzer) obj;
        return Objects.equals(this.id, that.id) && Objects.equals(this.text, that.text);
    }

}

/**
 * Lists the document classes used by this test case.
 * <p>
 * Both fixtures are returned because the tests in this class index and query
 * {@code DataWithTokenizedText} as well as {@code DataWithTokenizedTextSearchAnalyzer} documents.
 *
 * @return the fixture document types
 */
@Override
protected Collection<Class<?>> getTypes() {
    // a single return statement: a second one after it would be unreachable and fail compilation
    return List.of(DataWithTokenizedText.class, DataWithTokenizedTextSearchAnalyzer.class);
}

@Test
Expand Down Expand Up @@ -118,12 +158,46 @@ public void tokenizedWithSynonyms() throws Exception {
// without specific synonym analyzer searches for synonyms return no hits
Hits<DataWithTokenizedText> hits = search(Query.select(DataWithTokenizedText.class)
.where(Expressions.dismax(
Expressions.matchTextAll("text.tokenized", "barbecue").withSynonyms(true),
Expressions.matchTextAll("text.tokenized", "barbecue").withSynonymsEnabled(true),
Expressions.matchTextAll("text.tokenized", "calculus")
))
.limit(Integer.MAX_VALUE)
.build());
assertThat(hits).containsOnly(bbq);
}

/**
 * Searches for "barbecue" without explicitly enabling synonyms and expects only the "bbq weekend" document.
 */
@Test
public void tokenizedWithSynonymsDefaultSearchAnalyzer() throws Exception {
    final DataWithTokenizedTextSearchAnalyzer weekendDoc = new DataWithTokenizedTextSearchAnalyzer(KEY1, "bbq weekend");
    final DataWithTokenizedTextSearchAnalyzer stoneDoc = new DataWithTokenizedTextSearchAnalyzer(KEY2, "kidney stone");
    indexDocuments(weekendDoc, stoneDoc);

    // The "barbecue" clause sets neither an analyzer nor the synonym flag, so the match has to come from
    // the searchAnalyzer declared on the text.tokenized alias (TOKENIZED_SYNONYMS).
    // NOTE(review): assumes barbecue/bbq and calculus/stone are configured synonym pairs - confirm.
    // The "calculus" clause disables synonyms explicitly, so the "kidney stone" document must not be returned.
    final Hits<DataWithTokenizedTextSearchAnalyzer> matches = search(
        Query.select(DataWithTokenizedTextSearchAnalyzer.class)
            .where(
                Expressions.dismax(
                    Expressions.matchTextAll("text.tokenized", "barbecue")
                        .withIgnoreStopwords(false),
                    Expressions.matchTextAll("text.tokenized", "calculus")
                        .withSynonymsEnabled(false)
                        .withIgnoreStopwords(false)
                )
            )
            .limit(Integer.MAX_VALUE)
            .build()
    );

    assertThat(matches).containsOnly(weekendDoc);
}

/**
 * Searches with stopword ignoring enabled on both clauses and expects only the "bbq weekend" document.
 */
@Test
public void tokenizedWithIgnoringStopwords() throws Exception {
    final DataWithTokenizedTextSearchAnalyzer weekendDoc = new DataWithTokenizedTextSearchAnalyzer(KEY1, "bbq weekend");
    final DataWithTokenizedTextSearchAnalyzer stoneDoc = new DataWithTokenizedTextSearchAnalyzer(KEY2, "kidney stone");
    indexDocuments(weekendDoc, stoneDoc);

    // "bbq and" is searched with all terms required, synonyms disabled and stopwords ignored.
    // NOTE(review): presumably "and" is dropped as a stopword so that only "bbq" has to match - confirm.
    // The "calculus" clause only sets the stopword flag; the "kidney stone" document must not be returned.
    final Hits<DataWithTokenizedTextSearchAnalyzer> matches = search(
        Query.select(DataWithTokenizedTextSearchAnalyzer.class)
            .where(
                Expressions.dismax(
                    Expressions.matchTextAll("text.tokenized", "bbq and")
                        .withSynonymsEnabled(false)
                        .withIgnoreStopwords(true),
                    Expressions.matchTextAll("text.tokenized", "calculus")
                        .withIgnoreStopwords(true)
                )
            )
            .limit(Integer.MAX_VALUE)
            .build()
    );

    assertThat(matches).containsOnly(weekendDoc);
}

}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2011-2023 B2i Healthcare, https://b2ihealthcare.com
* Copyright 2011-2024 B2i Healthcare, https://b2ihealthcare.com
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -29,6 +29,7 @@
import org.slf4j.Logger;

import com.b2international.commons.exceptions.FormattedRuntimeException;
import com.b2international.index.Analyzers;
import com.b2international.index.IndexClientFactory;
import com.b2international.index.compat.TextConstants;
import com.b2international.index.mapping.DocumentMapping;
Expand Down Expand Up @@ -276,20 +277,23 @@ private void visit(TextPredicate predicate) {
final String term = predicate.term();
final MatchType type = predicate.type();
final int minShouldMatch = predicate.minShouldMatch();

QueryBuilder query;

String searchAnalyzer = selectSearchAnalyzer(predicate);
switch (type) {
case BOOLEAN_PREFIX:
query = QueryBuilders.matchBoolPrefixQuery(field, term)
.analyzer(predicate.analyzer())
.analyzer(searchAnalyzer)
.operator(Operator.AND);
break;
case PHRASE:
query = QueryBuilders.matchPhraseQuery(field, term)
.analyzer(predicate.analyzer());
.analyzer(searchAnalyzer);
break;
case ALL:
MatchQueryBuilder all = QueryBuilders.matchQuery(field, term)
.analyzer(predicate.analyzer())
.analyzer(searchAnalyzer)
.operator(Operator.AND);
if (!Strings.isNullOrEmpty(predicate.fuzziness())) {
all
Expand All @@ -301,7 +305,7 @@ private void visit(TextPredicate predicate) {
break;
case ANY:
MatchQueryBuilder any = QueryBuilders.matchQuery(field, term)
.analyzer(predicate.analyzer())
.analyzer(searchAnalyzer)
.operator(Operator.OR)
.minimumShouldMatch(Integer.toString(minShouldMatch));
if (!Strings.isNullOrEmpty(predicate.fuzziness())) {
Expand All @@ -314,7 +318,7 @@ private void visit(TextPredicate predicate) {
break;
case PARSED:
query = QueryBuilders.queryStringQuery(TextConstants.escape(term))
.analyzer(predicate.analyzer())
.analyzer(searchAnalyzer)
.field(field)
.escape(false)
.allowLeadingWildcard(true)
Expand All @@ -330,6 +334,38 @@ private void visit(TextPredicate predicate) {
deque.push(query);
}

/**
 * Picks the analyzer name to set on the query built for the given text predicate.
 * <p>
 * An analyzer set explicitly on the predicate always wins. Otherwise the analyzer is derived from the
 * predicate's two optional flags:
 * <ul>
 * <li>stopwords ignored, synonyms enabled: {@code TOKENIZED_SYNONYMS_IGNORE_STOPWORDS}</li>
 * <li>stopwords ignored, synonyms disabled or unset: {@code TOKENIZED_IGNORE_STOPWORDS}</li>
 * <li>stopwords not ignored or unset, synonyms enabled: {@code TOKENIZED_SYNONYMS}</li>
 * <li>stopwords not ignored or unset, synonyms disabled: {@code TOKENIZED}</li>
 * <li>stopwords not ignored or unset, synonyms unset: {@code null}</li>
 * </ul>
 *
 * @param predicate the text predicate being converted to a query
 * @return the analyzer name, or {@code null} when nothing selects one
 *         (NOTE(review): presumably the field's mapped search analyzer applies then - confirm)
 * @throws IllegalArgumentException defensive guard for an unexpected intermediate analyzer selection
 */
private String selectSearchAnalyzer(TextPredicate predicate) {
    // an explicitly configured analyzer overrides both flags
    final String explicitAnalyzer = predicate.analyzer();
    if (explicitAnalyzer != null) {
        return explicitAnalyzer;
    }

    // step 1: the synonym flag alone decides the base analyzer (stays null when the flag is unset)
    final Boolean synonymsFlag = predicate.synonymsEnabled();
    Analyzers base = null;
    if (synonymsFlag != null) {
        base = synonymsFlag ? Analyzers.TOKENIZED_SYNONYMS : Analyzers.TOKENIZED;
    }

    // step 2: an unset or false stopword flag leaves the base selection untouched
    final Boolean stopwordsFlag = predicate.ignoreStopwords();
    if (stopwordsFlag == null || !stopwordsFlag) {
        return base == null ? null : base.getAnalyzer();
    }

    // step 3: upgrade the base selection to its stopword ignoring counterpart
    if (base == null || base == Analyzers.TOKENIZED) {
        return Analyzers.TOKENIZED_IGNORE_STOPWORDS.getAnalyzer();
    }
    if (base == Analyzers.TOKENIZED_SYNONYMS) {
        return Analyzers.TOKENIZED_SYNONYMS_IGNORE_STOPWORDS.getAnalyzer();
    }
    throw new IllegalArgumentException("Unsupported analyzer configuration");
}

/**
 * Converts a single argument predicate to a term query on the predicate's field path and argument,
 * then pushes the query onto the deque.
 *
 * @param predicate the predicate to convert
 */
private void visit(SingleArgumentPredicate<?> predicate) {
    final String fieldPath = toFieldPath(predicate);
    deque.push(QueryBuilders.termQuery(fieldPath, predicate.getArgument()));
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2011-2023 B2i Healthcare, https://b2ihealthcare.com
* Copyright 2011-2024 B2i Healthcare, https://b2ihealthcare.com
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -37,19 +37,24 @@
*/
public class Expressions {

// special
// special constants used to mark up dynamic expressions for dynamic field matching in #matchDynamic expression builder
// separates key-value pairs in a dynamic field expression (if not used field exist expression will be used for the value)
private static final String DYNAMIC_VALUE_DELIMITER = "#";

// special in: syntax support
private static final String IN_CLAUSE = "in:";
private static final Splitter COMMA_SPLITTER = Splitter.on(",");
private static final Joiner COMMA_JOINER = Joiner.on(",");

// special range: syntax support
private static final String RANGE_CLAUSE = "range:";
private static final String RANGE_EXPRESSION_DELIMITER = "..";
private static final Splitter RANGE_SPLITTER = Splitter.on(RANGE_EXPRESSION_DELIMITER);
private static final Joiner RANGE_JOINER = Joiner.on(RANGE_EXPRESSION_DELIMITER);
// dynamic prefix match character
private static final String DYNAMIC_PREFIX_MATCH_CHAR = "*";

// multi-valued dynamic term matching constants, expected format is 'in:VALUE_1,VALUE_2,VALUE_N'
private static final String DYNAMIC_IN_CLAUSE = "in:";
private static final String DYNAMIC_IN_VALUES_SEPARATOR = ",";
private static final Splitter DYNAMIC_IN_VALUES_SPLITTER = Splitter.on(DYNAMIC_IN_VALUES_SEPARATOR);
private static final Joiner DYNAMIC_IN_VALUES_JOINER = Joiner.on(DYNAMIC_IN_VALUES_SEPARATOR);

// dynamic range field matching constants, expected format is 'range:lower..upper' where lower and upper parts are optional
private static final String DYNAMIC_RANGE_CLAUSE = "range:";
private static final String DYNAMIC_RANGE_VALUES_SEPARATOR = "..";
private static final Splitter DYNAMIC_RANGE_SPLITTER = Splitter.on(DYNAMIC_RANGE_VALUES_SEPARATOR);
private static final Joiner DYNAMIC_RANGE_JOINER = Joiner.on(DYNAMIC_RANGE_VALUES_SEPARATOR);

public static final class ExpressionBuilder extends AbstractExpressionBuilder<ExpressionBuilder> {

Expand Down Expand Up @@ -308,12 +313,12 @@ public static Expression matchDynamic(String field, Collection<String> dynamicFi
final String fieldToMatch = String.join(".", field, propertyName);

// check special syntax first
if (propertyValue.startsWith(IN_CLAUSE)) {
if (propertyValue.startsWith(DYNAMIC_IN_CLAUSE)) {
// multi-valued -> terms query
bool.filter(Expressions.matchAny(fieldToMatch, COMMA_SPLITTER.splitToList(propertyValue.substring(IN_CLAUSE.length()))));
} else if (propertyValue.startsWith(RANGE_CLAUSE)) {
bool.filter(Expressions.matchAny(fieldToMatch, DYNAMIC_IN_VALUES_SPLITTER.splitToList(propertyValue.substring(DYNAMIC_IN_CLAUSE.length()))));
} else if (propertyValue.startsWith(DYNAMIC_RANGE_CLAUSE)) {
// range -> range query
final List<String> rangeValues = RANGE_SPLITTER.splitToList(propertyValue.substring(RANGE_CLAUSE.length()));
final List<String> rangeValues = DYNAMIC_RANGE_SPLITTER.splitToList(propertyValue.substring(DYNAMIC_RANGE_CLAUSE.length()));
if (rangeValues.size() > 2) {
throw new BadRequestException("Multiple range expressions (<min>..<max>) are found in property value match. Only a single <min>..<max> expression is allowed.", propertyValue);
} else if (rangeValues.size() <= 1) {
Expand All @@ -323,7 +328,7 @@ public static Expression matchDynamic(String field, Collection<String> dynamicFi
String upper = Strings.emptyToNull(rangeValues.get(1));
bool.filter(Expressions.matchRange(fieldToMatch, lower, upper));
}
} else if (propertyValue.endsWith("*")) {
} else if (propertyValue.endsWith(DYNAMIC_PREFIX_MATCH_CHAR)) {
// wildcard at the end -> prefix query
bool.filter(Expressions.prefixMatch(fieldToMatch, propertyValue.substring(0, (propertyValue.length() - 1))));
} else {
Expand All @@ -346,10 +351,10 @@ public static String toDynamicFieldFilter(String field, Object value) {
final String filterValue;
if (value instanceof Iterable<?> iterable) {
// to valid in: expression if it is an iterable type
filterValue = IN_CLAUSE.concat(COMMA_JOINER.join(iterable));
filterValue = DYNAMIC_IN_CLAUSE.concat(DYNAMIC_IN_VALUES_JOINER.join(iterable));
} else if (value instanceof Range<?> range) {
// or to range: expression if it is a Guava Range object
filterValue = RANGE_CLAUSE.concat(RANGE_JOINER.join(range.lowerEndpoint(), range.upperEndpoint()));
filterValue = DYNAMIC_RANGE_CLAUSE.concat(DYNAMIC_RANGE_JOINER.join(range.lowerEndpoint(), range.upperEndpoint()));
} else {
// otherwise convert it to string to be able to join with the field name
filterValue = String.valueOf(value);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2011-2022 B2i Healthcare, https://b2ihealthcare.com
* Copyright 2011-2024 B2i Healthcare, https://b2ihealthcare.com
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -32,6 +32,8 @@ public enum MatchType {
private final int minShouldMatch;

private Analyzers analyzer;
private Boolean synonymsEnabled;
private Boolean ignoreStopwords;

private String fuzziness;
private int maxExpansions = 10;
Expand Down Expand Up @@ -81,6 +83,24 @@ public int prefixLength() {
return prefixLength;
}

/**
 * @return the synonym flag stored on this predicate; may be {@code null}
 *         (NOTE(review): presumably {@code null} means "not specified" - confirm against the query builder)
 */
public Boolean synonymsEnabled() {
    return this.synonymsEnabled;
}

/**
 * Stores the given synonym flag on this predicate; the value is kept as is, including {@code null}.
 *
 * @param synonymsEnabled the flag value to store
 * @return this predicate, for call chaining
 */
public TextPredicate withSynonymsEnabled(Boolean synonymsEnabled) {
    final TextPredicate self = this;
    self.synonymsEnabled = synonymsEnabled;
    return self;
}

/**
 * @return the stopword ignoring flag stored on this predicate; may be {@code null}
 *         (NOTE(review): presumably {@code null} means "not specified" - confirm against the query builder)
 */
public Boolean ignoreStopwords() {
    return this.ignoreStopwords;
}

/**
 * Stores the given stopword ignoring flag on this predicate; the value is kept as is, including {@code null}.
 *
 * @param ignoreStopwords the flag value to store
 * @return this predicate, for call chaining
 */
public TextPredicate withIgnoreStopwords(Boolean ignoreStopwords) {
    final TextPredicate self = this;
    self.ignoreStopwords = ignoreStopwords;
    return self;
}

public TextPredicate withFuzziness(String fuzziness) {
return withFuzziness(fuzziness, prefixLength, maxExpansions);
}
Expand All @@ -97,24 +117,4 @@ public String toString() {
return String.format("TEXT(%s %s '%s'[])", getField(), type(), term(), CompareUtils.isEmpty(analyzer));
}

/**
 * Switches the current analyzer to its stopword ignoring or stopword preserving variant while keeping
 * the synonym setting implied by the current analyzer.
 * <p>
 * The synonym setting is considered enabled when the current analyzer is {@code TOKENIZED_SYNONYMS} or
 * {@code TOKENIZED_SYNONYMS_IGNORE_STOPWORDS}. It has to be checked in both branches: otherwise calling
 * this method with {@code false} on a {@code TOKENIZED_SYNONYMS} predicate would fall back to plain
 * {@code TOKENIZED} and silently drop synonym matching.
 *
 * @param ignoreStopwords {@code true} to select a stopword ignoring analyzer, {@code false} to select one that keeps stopwords
 * @return the result of {@code withAnalyzer} called with the selected analyzer
 */
public TextPredicate withIgnoreStopwords(boolean ignoreStopwords) {
    final boolean synonymsActive = analyzer == Analyzers.TOKENIZED_SYNONYMS
            || analyzer == Analyzers.TOKENIZED_SYNONYMS_IGNORE_STOPWORDS;
    if (ignoreStopwords) {
        return withAnalyzer(synonymsActive ? Analyzers.TOKENIZED_SYNONYMS_IGNORE_STOPWORDS : Analyzers.TOKENIZED_IGNORE_STOPWORDS);
    }
    return withAnalyzer(synonymsActive ? Analyzers.TOKENIZED_SYNONYMS : Analyzers.TOKENIZED);
}

/**
 * Switches the current analyzer to its synonym enabled or synonym free variant while keeping the
 * stopword setting implied by the current analyzer.
 * <p>
 * Stopword ignoring is considered active when the current analyzer is {@code TOKENIZED_IGNORE_STOPWORDS}
 * or {@code TOKENIZED_SYNONYMS_IGNORE_STOPWORDS}. It has to be checked in both branches: otherwise calling
 * this method with {@code false} on a {@code TOKENIZED_IGNORE_STOPWORDS} predicate would fall back to
 * plain {@code TOKENIZED} and silently drop stopword ignoring.
 *
 * @param enableSynonyms {@code true} to select a synonym analyzer, {@code false} to select one without
 *        synonyms, {@code null} to leave the predicate unchanged
 * @return this predicate when {@code enableSynonyms} is {@code null}, otherwise the result of
 *         {@code withAnalyzer} called with the selected analyzer
 */
public TextPredicate withSynonyms(Boolean enableSynonyms) {
    // if enableSynonyms is not a valid boolean value keep it unchanged, use the default set in the mapping
    if (enableSynonyms == null) {
        return this;
    }
    final boolean stopwordsIgnored = analyzer == Analyzers.TOKENIZED_IGNORE_STOPWORDS
            || analyzer == Analyzers.TOKENIZED_SYNONYMS_IGNORE_STOPWORDS;
    if (enableSynonyms) {
        return withAnalyzer(stopwordsIgnored ? Analyzers.TOKENIZED_SYNONYMS_IGNORE_STOPWORDS : Analyzers.TOKENIZED_SYNONYMS);
    }
    return withAnalyzer(stopwordsIgnored ? Analyzers.TOKENIZED_IGNORE_STOPWORDS : Analyzers.TOKENIZED);
}

}
Loading

0 comments on commit 802b50d

Please sign in to comment.