Skip to content

Commit

Permalink
Update Lindera (#76)
Browse files Browse the repository at this point in the history
* Update Lindera

* Fix format

* Update CHANGES.md
  • Loading branch information
mosuka authored Apr 30, 2023
1 parent 1e2a8b2 commit 90b9164
Show file tree
Hide file tree
Showing 11 changed files with 73 additions and 89 deletions.
3 changes: 3 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
All notable changes to this project will be documented in this file.
This project adheres to [Semantic Versioning](http://semver.org/).

## 0.24.0 (2023-04-30)
- Update Lindera #76 @mosuka

## 0.23.1 (2023-04-07)
- Update Lindera #74 @mosuka

Expand Down
24 changes: 13 additions & 11 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "lindera-tantivy"
version = "0.23.1"
version = "0.24.0"
edition = "2021"
description = "Lindera Tokenizer for Tantivy."
documentation = "https://docs.rs/lindera-tantivy"
Expand All @@ -12,20 +12,22 @@ categories = ["text-processing"]
license = "MIT"

[features]
default = ["ipadic"] # Japanese dictionary
ipadic = ["lindera/ipadic"] # Japanese dictionary
unidic = ["lindera/unidic"] # Japanese dictionary
ko-dic = ["lindera/ko-dic"] # Korean dictionary
cc-cedict = ["lindera/cc-cedict"] # Chinese dictionary
ipadic-compress = ["lindera/ipadic-compress"]
unidic-compress = ["lindera/unidic-compress"]
ko-dic-compress = ["lindera/ko-dic-compress"]
cc-cedict-compress = ["lindera/cc-cedict-compress"]
default = [] # Japanese dictionary
ipadic = ["lindera-tokenizer/ipadic"] # Japanese dictionary
unidic = ["lindera-tokenizer/unidic"] # Japanese dictionary
ko-dic = ["lindera-tokenizer/ko-dic"] # Korean dictionary
cc-cedict = ["lindera-tokenizer/cc-cedict"] # Chinese dictionary
ipadic-compress = ["lindera-tokenizer/ipadic-compress"]
unidic-compress = ["lindera-tokenizer/unidic-compress"]
ko-dic-compress = ["lindera-tokenizer/ko-dic-compress"]
cc-cedict-compress = ["lindera-tokenizer/cc-cedict-compress"]

[dependencies]
tantivy = "0.19.2"

lindera = "0.23.1"
lindera-core = "0.24.0"
lindera-dictionary = "0.24.0"
lindera-tokenizer = "0.24.0"

[dev-dependencies]
criterion = { version = "0.4.0", features = ["html_reports"] }
Expand Down
17 changes: 6 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,11 @@ The following example enables IPADIC.

```
[dependencies]
lindera-tantivy = { version = "0.12.0", features = ["ipadic"] }
lindera-core = "0.24.0"
lindera-dictionary = "0.24.0"
lindera-tantivy = { version = "0.24.0", features = ["ipadic"] }
```

- ipadic: Japanese dictionary
- unidic: Japanese dictionary
- ko-dic: Korean dictionary
- cc-cedict: Chinese dictionary


### Basic example

```rust
Expand All @@ -32,10 +28,9 @@ use tantivy::{
Index,
};

use lindera_tantivy::{
dictionary::load_dictionary, tokenizer::LinderaTokenizer, DictionaryConfig, DictionaryKind,
Mode,
};
use lindera_core::mode::Mode;
use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
use lindera_tantivy::tokenizer::LinderaTokenizer;

fn main() -> tantivy::Result<()> {
// create schema builder
Expand Down
13 changes: 10 additions & 3 deletions benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ fn bench_indexing(c: &mut Criterion) {
use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
use tantivy::Index;

use lindera_core::mode::Mode;
use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
use lindera_tantivy::tokenizer::LinderaTokenizer;

// create schema builder
Expand Down Expand Up @@ -52,10 +54,15 @@ fn bench_indexing(c: &mut Criterion) {
docs.push(doc);
}

let dictionary_config = DictionaryConfig {
kind: Some(DictionaryKind::IPADIC),
path: None,
};
let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);

// register Lindera tokenizer
index
.tokenizers()
.register("lang_ja", LinderaTokenizer::default());
index.tokenizers().register("lang_ja", tokenizer);

// create index writer
let mut index_writer = index.writer(50_000_000).unwrap();
Expand Down
9 changes: 4 additions & 5 deletions examples/cc-cedict_example.rs → examples/cc-cedict.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@ fn main() -> tantivy::Result<()> {
Index,
};

use lindera_tantivy::{
dictionary::load_dictionary, tokenizer::LinderaTokenizer, DictionaryConfig, DictionaryKind,
Mode,
};
use lindera_core::mode::Mode;
use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
use lindera_tantivy::tokenizer::LinderaTokenizer;

// create schema builder
let mut schema_builder = Schema::builder();
Expand Down Expand Up @@ -63,7 +62,7 @@ fn main() -> tantivy::Result<()> {
kind: Some(DictionaryKind::CcCedict),
path: None,
};
let dictionary = load_dictionary(dictionary_config).unwrap();
let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);

// register Lindera tokenizer
Expand Down
9 changes: 4 additions & 5 deletions examples/ipadic_example.rs → examples/ipadic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@ fn main() -> tantivy::Result<()> {
Index,
};

use lindera_tantivy::{
dictionary::load_dictionary, tokenizer::LinderaTokenizer, DictionaryConfig, DictionaryKind,
Mode,
};
use lindera_core::mode::Mode;
use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
use lindera_tantivy::tokenizer::LinderaTokenizer;

// create schema builder
let mut schema_builder = Schema::builder();
Expand Down Expand Up @@ -63,7 +62,7 @@ fn main() -> tantivy::Result<()> {
kind: Some(DictionaryKind::IPADIC),
path: None,
};
let dictionary = load_dictionary(dictionary_config).unwrap();
let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);

// register Lindera tokenizer
Expand Down
9 changes: 4 additions & 5 deletions examples/ko-dic_example.rs → examples/ko-dic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@ fn main() -> tantivy::Result<()> {
Index,
};

use lindera_tantivy::{
dictionary::load_dictionary, tokenizer::LinderaTokenizer, DictionaryConfig, DictionaryKind,
Mode,
};
use lindera_core::mode::Mode;
use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
use lindera_tantivy::tokenizer::LinderaTokenizer;

// create schema builder
let mut schema_builder = Schema::builder();
Expand Down Expand Up @@ -63,7 +62,7 @@ fn main() -> tantivy::Result<()> {
kind: Some(DictionaryKind::KoDic),
path: None,
};
let dictionary = load_dictionary(dictionary_config).unwrap();
let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);

// register Lindera tokenizer
Expand Down
9 changes: 4 additions & 5 deletions examples/unidic_example.rs → examples/unidic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@ fn main() -> tantivy::Result<()> {
Index,
};

use lindera_tantivy::{
dictionary::load_dictionary, tokenizer::LinderaTokenizer, DictionaryConfig, DictionaryKind,
Mode,
};
use lindera_core::mode::Mode;
use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
use lindera_tantivy::tokenizer::LinderaTokenizer;

// create schema builder
let mut schema_builder = Schema::builder();
Expand Down Expand Up @@ -63,7 +62,7 @@ fn main() -> tantivy::Result<()> {
kind: Some(DictionaryKind::UniDic),
path: None,
};
let dictionary = load_dictionary(dictionary_config).unwrap();
let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);

// register Lindera tokenizer
Expand Down
5 changes: 0 additions & 5 deletions src/dictionary.rs

This file was deleted.

10 changes: 0 additions & 10 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,2 @@
pub mod dictionary;
pub mod stream;
pub mod tokenizer;

pub type LinderaResult<T> = lindera::LinderaResult<T>;
pub type Penalty = lindera::mode::Penalty;
pub type Mode = lindera::mode::Mode;
pub type DictionaryConfig = lindera::dictionary::DictionaryConfig;
pub type UserDictionryConfig = lindera::dictionary::UserDictionaryConfig;
pub type DictionaryKind = lindera::DictionaryKind;
pub type Dictionary = lindera::Dictionary;
pub type UserDictionary = lindera::UserDictionary;
54 changes: 25 additions & 29 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@ use std::collections::VecDeque;

use tantivy::tokenizer::{BoxTokenStream, Token, Tokenizer};

use lindera::tokenizer::Tokenizer as LTokenizer;

use crate::{
dictionary::load_dictionary, stream::LinderaTokenStream, Dictionary, DictionaryConfig,
DictionaryKind, Mode, UserDictionary,
use lindera_core::{
dictionary::{Dictionary, UserDictionary},
mode::Mode,
};
use lindera_tokenizer::tokenizer::Tokenizer as LTokenizer;

use crate::stream::LinderaTokenStream;

pub struct LinderaTokenizer {
pub tokenizer: LTokenizer,
Expand All @@ -33,25 +34,6 @@ impl LinderaTokenizer {
}
}

impl Default for LinderaTokenizer {
fn default() -> Self {
// Dictionary.
let dictionary = load_dictionary(DictionaryConfig {
kind: Some(DictionaryKind::IPADIC),
path: None,
})
.unwrap();

// User dictionary.
let user_dictionary = None;

// Mode.
let mode = Mode::Normal;

Self::new(dictionary, user_dictionary, mode)
}
}

impl Tokenizer for LinderaTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
let tokens = match self.tokenizer.tokenize(text) {
Expand All @@ -77,6 +59,9 @@ impl Tokenizer for LinderaTokenizer {
mod tests {
use tantivy::tokenizer::{BoxTokenStream, Token, Tokenizer};

use lindera_core::mode::Mode;
use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};

use crate::tokenizer::LinderaTokenizer;

fn test_helper(mut tokenizer: BoxTokenStream) -> Vec<Token> {
Expand All @@ -87,8 +72,14 @@ mod tests {

#[test]
fn test_tokenizer() {
let tokens =
test_helper(LinderaTokenizer::default().token_stream("すもももももももものうち"));
let dictionary_config = DictionaryConfig {
kind: Some(DictionaryKind::IPADIC),
path: None,
};
let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);

let tokens = test_helper(tokenizer.token_stream("すもももももももものうち"));
assert_eq!(tokens.len(), 7);
{
let token = &tokens[0];
Expand Down Expand Up @@ -150,9 +141,14 @@ mod tests {

#[test]
fn test_tokenizer_lindera() {
let tokens = test_helper(
LinderaTokenizer::default().token_stream("Linderaは形態素解析エンジンです。"),
);
let dictionary_config = DictionaryConfig {
kind: Some(DictionaryKind::IPADIC),
path: None,
};
let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);

let tokens = test_helper(tokenizer.token_stream("Linderaは形態素解析エンジンです。"));
assert_eq!(tokens.len(), 7);
{
let token = &tokens[0];
Expand Down

0 comments on commit 90b9164

Please sign in to comment.