Skip to content

Commit

Permalink
Adding access to noSubMatches and noOverlappingMatches in HyphenationCompoundWordTokenFilter (#13895)
Browse files Browse the repository at this point in the history

* Adding access to noSubMatches and noOverlappingMatches in HyphenationCompoundWordTokenFilter

Signed-off-by: Evan Kielley <evankielley@gmail.com>

* Add Changelog Entry

Signed-off-by: Mohammad Hasnain Mohsin Rajan <hasnain2808@gmail.com>

* test: add hyphenation decompounder tests

Signed-off-by: Mohammad Hasnain <hasnain2808@gmail.com>

* test: refactor tests

Signed-off-by: Mohammad Hasnain <hasnain2808@gmail.com>

* test: reformat test files

Signed-off-by: Mohammad Hasnain <hasnain2808@gmail.com>

* chore: add changelog entry for 2.X

Signed-off-by: Mohammad Hasnain <hasnain2808@gmail.com>

* chore: remove 3.x changelog

Signed-off-by: Mohammad Hasnain <hasnain2808@gmail.com>

* chore: commonify settingsarr

Signed-off-by: Mohammad Hasnain <hasnain2808@gmail.com>

* chore: commonify settingsarr

Signed-off-by: Mohammad Hasnain <hasnain2808@gmail.com>

* chore: linting

Signed-off-by: Mohammad Hasnain <hasnain2808@gmail.com>

---------

Signed-off-by: Evan Kielley <evankielley@gmail.com>
Signed-off-by: Mohammad Hasnain Mohsin Rajan <hasnain2808@gmail.com>
Signed-off-by: Mohammad Hasnain <hasnain2808@gmail.com>
Co-authored-by: Evan Kielley <evankielley@gmail.com>
  • Loading branch information
hasnain2808 and evankielley committed Aug 21, 2024
1 parent 13163ab commit ce64fac
Show file tree
Hide file tree
Showing 6 changed files with 1,313 additions and 16 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- [Workload Management] QueryGroup resource tracking framework changes ([#13897](https://github.com/opensearch-project/OpenSearch/pull/13897))
- Support filtering on a large list encoded by bitmap ([#14774](https://github.com/opensearch-project/OpenSearch/pull/14774))
- Add slice execution listeners to SearchOperationListener interface ([#15153](https://github.com/opensearch-project/OpenSearch/pull/15153))
- Adding access to noSubMatches and noOverlappingMatches in Hyphenation ([#13895](https://github.com/opensearch-project/OpenSearch/pull/13895))

### Dependencies
- Bump `netty` from 4.1.111.Final to 4.1.112.Final ([#15081](https://github.com/opensearch-project/OpenSearch/pull/15081))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,16 @@
*/
public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundWordTokenFilterFactory {

private final boolean noSubMatches;
private final boolean noOverlappingMatches;
private final HyphenationTree hyphenationTree;

HyphenationCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, env, name, settings);

noSubMatches = settings.getAsBoolean("no_sub_matches", false);
noOverlappingMatches = settings.getAsBoolean("no_overlapping_matches", false);

String hyphenationPatternsPath = settings.get("hyphenation_patterns_path", null);
if (hyphenationPatternsPath == null) {
throw new IllegalArgumentException("hyphenation_patterns_path is a required setting.");
Expand All @@ -85,7 +90,9 @@ public TokenStream create(TokenStream tokenStream) {
minWordSize,
minSubwordSize,
maxSubwordSize,
onlyLongestMatch
onlyLongestMatch,
noSubMatches,
noOverlappingMatches
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,12 @@
import org.opensearch.test.IndexSettingsModule;
import org.opensearch.test.OpenSearchTestCase;
import org.hamcrest.MatcherAssert;
import org.junit.Before;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
Expand All @@ -63,17 +67,27 @@
import static org.hamcrest.Matchers.instanceOf;

public class CompoundAnalysisTests extends OpenSearchTestCase {

Settings[] settingsArr;

/**
 * Builds the settings fixtures used by the tests in this class.
 *
 * A single temporary home directory is created, the hyphenation patterns file is
 * copied beneath it, and both the JSON-based and the YAML-based settings are then
 * built against that same home directory and stored in {@code settingsArr}.
 */
@Before
public void initialize() throws IOException {
    final Path homeDir = createTempDir();
    copyHyphenationPatternsFile(homeDir);

    // JSON first, YAML second: the tests iterate over the array in this order.
    final Settings fromJson = getJsonSettings(homeDir);
    final Settings fromYaml = getYamlSettings(homeDir);
    this.settingsArr = new Settings[] { fromJson, fromYaml };
}

public void testDefaultsCompoundAnalysis() throws Exception {
Settings settings = getJsonSettings();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
AnalysisModule analysisModule = createAnalysisModule(settings);
TokenFilterFactory filterFactory = analysisModule.getAnalysisRegistry().buildTokenFilterFactories(idxSettings).get("dict_dec");
MatcherAssert.assertThat(filterFactory, instanceOf(DictionaryCompoundWordTokenFilterFactory.class));
for (Settings settings : this.settingsArr) {
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
AnalysisModule analysisModule = createAnalysisModule(settings);
TokenFilterFactory filterFactory = analysisModule.getAnalysisRegistry().buildTokenFilterFactories(idxSettings).get("dict_dec");
MatcherAssert.assertThat(filterFactory, instanceOf(DictionaryCompoundWordTokenFilterFactory.class));
}
}

public void testDictionaryDecompounder() throws Exception {
Settings[] settingsArr = new Settings[] { getJsonSettings(), getYamlSettings() };
for (Settings settings : settingsArr) {
for (Settings settings : this.settingsArr) {
List<String> terms = analyze(settings, "decompoundingAnalyzer", "donaudampfschiff spargelcremesuppe");
MatcherAssert.assertThat(terms.size(), equalTo(8));
MatcherAssert.assertThat(
Expand All @@ -83,6 +97,26 @@ public void testDictionaryDecompounder() throws Exception {
}
}

// Mirrors the behavior checked by the corresponding Lucene test, see
// lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestHyphenationCompoundWordTokenFilterFactory.java
//
// Runs the "hyphenationAnalyzer" analyzer over a Danish sentence for every settings
// variant and checks that 10 terms come back, including the compound "læsehest"
// together with its parts "læse" and "hest".
public void testHyphenationDecompounder() throws Exception {
    final String[] expectedTerms = { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" };
    for (int i = 0; i < this.settingsArr.length; i++) {
        final List<String> tokens = analyze(this.settingsArr[i], "hyphenationAnalyzer", "min veninde som er lidt af en læsehest");
        MatcherAssert.assertThat(tokens.size(), equalTo(10));
        MatcherAssert.assertThat(tokens, hasItems(expectedTerms));
    }
}

// Mirrors the behavior checked by the corresponding Lucene test, see
// lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestHyphenationCompoundWordTokenFilterFactory.java
//
// Runs the "hyphenationAnalyzerNoSubMatches" analyzer over "basketballkurv" for every
// settings variant and checks that exactly 3 terms come back: the original word plus
// "basketball" and "kurv".
public void testHyphenationDecompounderNoSubMatches() throws Exception {
    final String[] expectedTerms = { "basketballkurv", "basketball", "kurv" };
    for (int i = 0; i < this.settingsArr.length; i++) {
        final List<String> tokens = analyze(this.settingsArr[i], "hyphenationAnalyzerNoSubMatches", "basketballkurv");
        MatcherAssert.assertThat(tokens.size(), equalTo(3));
        MatcherAssert.assertThat(tokens, hasItems(expectedTerms));
    }
}

private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
AnalysisModule analysisModule = createAnalysisModule(settings);
Expand Down Expand Up @@ -111,21 +145,28 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
}));
}

private Settings getJsonSettings() throws IOException {
/**
 * Copies the {@code da_UTF8.xml} classpath resource (looked up relative to this
 * test class) to {@code <home>/config/da_UTF8.xml}.
 *
 * The {@code config} directory is created by this method, so it must not exist yet.
 *
 * @param home directory under which the {@code config} directory is created
 * @throws IOException          if the directory cannot be created or the copy fails
 * @throws NullPointerException if the {@code da_UTF8.xml} resource cannot be found
 */
private void copyHyphenationPatternsFile(Path home) throws IOException {
    // try-with-resources ensures the classpath stream is closed even when the
    // directory creation or the copy throws.
    try (InputStream patternsStream = getClass().getResourceAsStream("da_UTF8.xml")) {
        // getResourceAsStream returns null for a missing resource; fail with a
        // message that names the file instead of a bare NPE from Files.copy.
        java.util.Objects.requireNonNull(patternsStream, "test resource da_UTF8.xml not found on the classpath");

        final Path config = home.resolve("config");
        Files.createDirectory(config);
        Files.copy(patternsStream, config.resolve("da_UTF8.xml"));
    }
}

private Settings getJsonSettings(Path home) throws IOException {
String json = "/org/opensearch/analysis/common/test1.json";
return Settings.builder()
.loadFromStream(json, getClass().getResourceAsStream(json), false)
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put(Environment.PATH_HOME_SETTING.getKey(), home.toString())
.build();
}

private Settings getYamlSettings() throws IOException {
private Settings getYamlSettings(Path home) throws IOException {
String yaml = "/org/opensearch/analysis/common/test1.yml";
return Settings.builder()
.loadFromStream(yaml, getClass().getResourceAsStream(yaml), false)
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put(Environment.PATH_HOME_SETTING.getKey(), home.toString())
.build();
}
}
Loading

0 comments on commit ce64fac

Please sign in to comment.