Skip to content

Commit

Permalink
Merge branch 'main' into integrate_IRC
Browse files Browse the repository at this point in the history
Signed-off-by: Sagar <99425694+sgup432@users.noreply.github.com>
  • Loading branch information
sgup432 authored Mar 5, 2024
2 parents f1b7094 + 11836d0 commit eb31e01
Show file tree
Hide file tree
Showing 12 changed files with 205 additions and 4 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Support for returning scores in matched queries ([#11626](https://github.com/opensearch-project/OpenSearch/pull/11626))
- Add shard id property to SearchLookup for use in field types provided by plugins ([#1063](https://github.com/opensearch-project/OpenSearch/pull/1063))
- [Tiered caching] Integrating IndicesRequestCache with CacheService controlled by a feature flag ([#12533](https://github.com/opensearch-project/OpenSearch/pull/12533))
- Add kuromoji_completion analyzer and filter ([#4835](https://github.com/opensearch-project/OpenSearch/issues/4835))

### Dependencies
- Bump `peter-evans/find-comment` from 2 to 3 ([#12288](https://github.com/opensearch-project/OpenSearch/pull/12288))
Expand All @@ -120,6 +121,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Bump `com.netflix.nebula.ospackage-base` from 11.8.0 to 11.8.1 ([#12461](https://github.com/opensearch-project/OpenSearch/pull/12461))
- Bump `peter-evans/create-or-update-comment` from 3 to 4 ([#12462](https://github.com/opensearch-project/OpenSearch/pull/12462))
- Bump `lycheeverse/lychee-action` from 1.9.1 to 1.9.3 ([#12521](https://github.com/opensearch-project/OpenSearch/pull/12521))
- Bump `com.azure:azure-core` from 1.39.0 to 1.47.0 ([#12520](https://github.com/opensearch-project/OpenSearch/pull/12520))
- Bump `ch.qos.logback:logback-core` from 1.2.13 to 1.5.3 ([#12519](https://github.com/opensearch-project/OpenSearch/pull/12519))

### Changed
- Allow composite aggregation to run under a parent filter aggregation ([#11499](https://github.com/opensearch-project/OpenSearch/pull/11499))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.index.analysis;

import org.apache.lucene.analysis.ja.JapaneseCompletionAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseCompletionFilter;
import org.apache.lucene.analysis.ja.dict.UserDictionary;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.index.IndexSettings;

/**
 * Analyzer provider for the {@code kuromoji_completion} analyzer.
 *
 * <p>Builds a single {@link JapaneseCompletionAnalyzer} at construction time from the
 * index settings: the completion mode (index/query) and an optional user dictionary.
 * The same analyzer instance is handed out on every {@link #get()} call.
 */
public class KuromojiCompletionAnalyzerProvider extends AbstractIndexAnalyzerProvider<JapaneseCompletionAnalyzer> {

    // Built once in the constructor; analyzers are thread-safe and reusable.
    private final JapaneseCompletionAnalyzer completionAnalyzer;

    public KuromojiCompletionAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);
        // User dictionary first, then mode — matches the Lucene constructor signature.
        completionAnalyzer = new JapaneseCompletionAnalyzer(
            KuromojiTokenizerFactory.getUserDictionary(env, settings),
            KuromojiCompletionFilterFactory.getMode(settings)
        );
    }

    @Override
    public JapaneseCompletionAnalyzer get() {
        return completionAnalyzer;
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.index.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseCompletionFilter;
import org.apache.lucene.analysis.ja.JapaneseCompletionFilter.Mode;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.index.IndexSettings;

/**
 * Token filter factory for the {@code kuromoji_completion} filter.
 *
 * <p>Wraps a token stream in a {@link JapaneseCompletionFilter}, which emits romaji
 * readings alongside the original tokens. The {@code mode} setting selects
 * {@link Mode#INDEX} (default) or {@link Mode#QUERY} behavior.
 */
public class KuromojiCompletionFilterFactory extends AbstractTokenFilterFactory {
    private final Mode mode;

    public KuromojiCompletionFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);
        this.mode = getMode(settings);
    }

    /**
     * Resolves the completion mode from the {@code mode} setting.
     *
     * @param settings the filter/analyzer settings
     * @return {@link Mode#QUERY} when the setting is "query" (case-insensitive);
     *         {@link Mode#INDEX} otherwise (including when unset or unrecognized)
     */
    public static Mode getMode(Settings settings) {
        final String configured = settings.get("mode", null);
        // equalsIgnoreCase on the literal is null-safe: a missing setting falls through.
        if ("query".equalsIgnoreCase(configured)) {
            return Mode.QUERY;
        }
        // "index", unset, and any unknown value all resolve to the INDEX default.
        return Mode.INDEX;
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new JapaneseCompletionFilter(tokenStream, mode);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
import org.opensearch.index.analysis.JapaneseStopTokenFilterFactory;
import org.opensearch.index.analysis.KuromojiAnalyzerProvider;
import org.opensearch.index.analysis.KuromojiBaseFormFilterFactory;
import org.opensearch.index.analysis.KuromojiCompletionAnalyzerProvider;
import org.opensearch.index.analysis.KuromojiCompletionFilterFactory;
import org.opensearch.index.analysis.KuromojiIterationMarkCharFilterFactory;
import org.opensearch.index.analysis.KuromojiKatakanaStemmerFactory;
import org.opensearch.index.analysis.KuromojiNumberFilterFactory;
Expand Down Expand Up @@ -70,6 +72,7 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
extra.put("kuromoji_stemmer", KuromojiKatakanaStemmerFactory::new);
extra.put("ja_stop", JapaneseStopTokenFilterFactory::new);
extra.put("kuromoji_number", KuromojiNumberFilterFactory::new);
extra.put("kuromoji_completion", KuromojiCompletionFilterFactory::new);
return extra;
}

Expand All @@ -80,6 +83,9 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {

@Override
public Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
    // Defect in the captured text: the pre-merge `return singletonMap(...)` line was
    // retained ahead of the new map-building code, leaving it unreachable. The method
    // now registers both analyzers and returns the map once.
    Map<String, AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> extra = new HashMap<>();
    extra.put("kuromoji", KuromojiAnalyzerProvider::new);
    extra.put("kuromoji_completion", KuromojiCompletionAnalyzerProvider::new);
    return extra;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ protected Map<String, Class<?>> getTokenFilters() {
filters.put("japanesereadingform", KuromojiReadingFormFilterFactory.class);
filters.put("japanesekatakanastem", KuromojiKatakanaStemmerFactory.class);
filters.put("japanesenumber", KuromojiNumberFilterFactory.class);
filters.put("japanesecompletion", KuromojiCompletionFilterFactory.class);
return filters;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseCompletionAnalyzer;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.opensearch.Version;
Expand Down Expand Up @@ -85,6 +86,15 @@ public void testDefaultsKuromojiAnalysis() throws IOException {
filterFactory = analysis.tokenFilter.get("kuromoji_number");
assertThat(filterFactory, instanceOf(KuromojiNumberFilterFactory.class));

filterFactory = analysis.tokenFilter.get("kuromoji_completion");
assertThat(filterFactory, instanceOf(KuromojiCompletionFilterFactory.class));

filterFactory = analysis.tokenFilter.get("kuromoji_completion_index");
assertThat(filterFactory, instanceOf(KuromojiCompletionFilterFactory.class));

filterFactory = analysis.tokenFilter.get("kuromoji_completion_query");
assertThat(filterFactory, instanceOf(KuromojiCompletionFilterFactory.class));

IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
NamedAnalyzer analyzer = indexAnalyzers.get("kuromoji");
assertThat(analyzer.analyzer(), instanceOf(JapaneseAnalyzer.class));
Expand All @@ -93,6 +103,15 @@ public void testDefaultsKuromojiAnalysis() throws IOException {
assertThat(analyzer.analyzer(), instanceOf(CustomAnalyzer.class));
assertThat(analyzer.analyzer().tokenStream(null, new StringReader("")), instanceOf(JapaneseTokenizer.class));

analyzer = indexAnalyzers.get("kuromoji_completion");
assertThat(analyzer.analyzer(), instanceOf(JapaneseCompletionAnalyzer.class));

analyzer = indexAnalyzers.get("kuromoji_completion_index");
assertThat(analyzer.analyzer(), instanceOf(JapaneseCompletionAnalyzer.class));

analyzer = indexAnalyzers.get("kuromoji_completion_query");
assertThat(analyzer.analyzer(), instanceOf(JapaneseCompletionAnalyzer.class));

CharFilterFactory charFilterFactory = analysis.charFilter.get("kuromoji_iteration_mark");
assertThat(charFilterFactory, instanceOf(KuromojiIterationMarkCharFilterFactory.class));

Expand Down Expand Up @@ -199,6 +218,32 @@ public void testKatakanaStemFilter() throws IOException {
assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana);
}

/**
 * Verifies the kuromoji_completion token filter in its three registrations:
 * the unconfigured default, explicit mode=index, and explicit mode=query.
 */
public void testJapaneseCompletionFilter() throws IOException {
    TestAnalysis analysis = createTestAnalysis();

    final String input = "寿司がおいしいね";
    final String[] indexModeTokens = new String[] { "寿司", "susi", "sushi", "が", "ga", "おいしい", "oisii", "oishii", "ね", "ne" };
    final String[] queryModeTokens = new String[] { "寿司", "susi", "sushi", "がおいしいね", "gaoisiine", "gaoishiine" };

    // Unconfigured filter falls back to INDEX mode.
    Tokenizer tok = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tok.setReader(new StringReader(input));
    assertSimpleTSOutput(analysis.tokenFilter.get("kuromoji_completion").create(tok), indexModeTokens);

    // Explicit mode=index produces the same output as the default.
    tok = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tok.setReader(new StringReader(input));
    assertSimpleTSOutput(analysis.tokenFilter.get("kuromoji_completion_index").create(tok), indexModeTokens);

    // mode=query emits the trailing run as a single merged token (see queryModeTokens).
    tok = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tok.setReader(new StringReader(input));
    assertSimpleTSOutput(analysis.tokenFilter.get("kuromoji_completion_query").create(tok), queryModeTokens);
}

public void testIterationMarkCharFilter() throws IOException {
TestAnalysis analysis = createTestAnalysis();
// test only kanji
Expand Down Expand Up @@ -414,6 +459,30 @@ public void testDiscardCompoundToken() throws Exception {
assertSimpleTSOutput(tokenizer, expected);
}

/**
 * Verifies the kuromoji_completion analyzer in its three registrations:
 * the unconfigured default, explicit mode=index, and explicit mode=query.
 */
public void testJapaneseCompletionAnalyzer() throws Exception {
    TestAnalysis analysis = createTestAnalysis();
    IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;

    final String[] indexModeTokens = new String[] { "寿司", "susi", "sushi", "が", "ga", "おいしい", "oisii", "oishii", "ね", "ne" };
    final String[] queryModeTokens = new String[] { "寿司", "susi", "sushi", "がおいしいね", "gaoisiine", "gaoishiine" };

    // Unconfigured analyzer falls back to INDEX mode.
    NamedAnalyzer analyzer = indexAnalyzers.get("kuromoji_completion");
    try (TokenStream stream = analyzer.tokenStream("", "寿司がおいしいね")) {
        assertTokenStreamContents(stream, indexModeTokens);
    }

    // Explicit mode=index produces the same output as the default.
    analyzer = indexAnalyzers.get("kuromoji_completion_index");
    try (TokenStream stream = analyzer.tokenStream("", "寿司がおいしいね")) {
        assertTokenStreamContents(stream, indexModeTokens);
    }

    // mode=query emits the trailing run as a single merged token (see queryModeTokens).
    analyzer = indexAnalyzers.get("kuromoji_completion_query");
    try (TokenStream stream = analyzer.tokenStream("", "寿司がおいしいね")) {
        assertTokenStreamContents(stream, queryModeTokens);
    }
}

private TestAnalysis createTestAnalysis(Settings analysisSettings) throws IOException {
InputStream dict = KuromojiAnalysisTests.class.getResourceAsStream("user_dict.txt");
Path home = createTempDir();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,14 @@
"ja_stop" : {
"type": "ja_stop",
"stopwords": ["_japanese_", "スピード"]
},
"kuromoji_completion_index" : {
"type" : "kuromoji_completion",
"mode" : "index"
},
"kuromoji_completion_query" : {
"type" : "kuromoji_completion",
"mode" : "query"
}
},

Expand Down Expand Up @@ -70,6 +78,14 @@
"my_analyzer" : {
"type" : "custom",
"tokenizer" : "kuromoji_tokenizer"
},
"kuromoji_completion_index" : {
"type" : "kuromoji_completion",
"mode" : "index"
},
"kuromoji_completion_query" : {
"type" : "kuromoji_completion",
"mode" : "query"
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,24 @@
- match: { tokens.5.token: 飲む }
- match: { tokens.6.token: 行く }
---
"Completion Analyzer":
- do:
indices.analyze:
body:
text: 寿司がおいしいね
analyzer: kuromoji_completion
- length: { tokens: 10 }
- match: { tokens.0.token: "寿司" }
- match: { tokens.1.token: "susi" }
- match: { tokens.2.token: "sushi" }
- match: { tokens.3.token: "が" }
- match: { tokens.4.token: "ga" }
- match: { tokens.5.token: "おいしい" }
- match: { tokens.6.token: "oisii" }
- match: { tokens.7.token: "oishii" }
- match: { tokens.8.token: "ね" }
- match: { tokens.9.token: "ne" }
---
"Tokenizer":
- do:
indices.analyze:
Expand Down Expand Up @@ -57,3 +75,15 @@
filter: [kuromoji_stemmer]
- length: { tokens: 1 }
- match: { tokens.0.token: サーバ }
---
"Completion filter":
- do:
indices.analyze:
body:
text: 寿司
tokenizer: kuromoji_tokenizer
filter: [kuromoji_completion]
- length: { tokens: 3 }
- match: { tokens.0.token: "寿司" }
- match: { tokens.1.token: "susi" }
- match: { tokens.2.token: "sushi" }
2 changes: 1 addition & 1 deletion plugins/repository-azure/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ opensearchplugin {
}

dependencies {
api 'com.azure:azure-core:1.39.0'
api 'com.azure:azure-core:1.47.0'
api 'com.azure:azure-json:1.0.1'
api 'com.azure:azure-storage-common:12.21.2'
api 'com.azure:azure-core-http-netty:1.12.8'
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
6b300175826f0bb0916fca2fa5f70885b716e93f
2 changes: 1 addition & 1 deletion test/fixtures/hdfs-fixture/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ dependencies {
api 'org.apache.zookeeper:zookeeper:3.9.1'
api "org.apache.commons:commons-text:1.11.0"
api "commons-net:commons-net:3.10.0"
api "ch.qos.logback:logback-core:1.2.13"
api "ch.qos.logback:logback-core:1.5.3"
api "ch.qos.logback:logback-classic:1.2.13"
api 'org.apache.kerby:kerb-admin:2.0.3'
runtimeOnly "com.google.guava:guava:${versions.guava}"
Expand Down

0 comments on commit eb31e01

Please sign in to comment.