Skip to content

Commit

Permalink
Update Lindera (#76)
Browse files Browse the repository at this point in the history
* Update Lindera

* Fix format

* Update CHANGES.md
  • Loading branch information
mosuka authored Apr 30, 2023
1 parent 1e2a8b2 commit 90b9164
Show file tree
Hide file tree
Showing 11 changed files with 73 additions and 89 deletions.
3 changes: 3 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
All notable changes to this project will be documented in this file.
This project adheres to [Semantic Versioning](http://semver.org/).

## 0.24.0 (2023-04-30)
- Update Lindera #76 @mosuka

## 0.23.1 (2023-04-07)
- Update Lindera #74 @mosuka

Expand Down
24 changes: 13 additions & 11 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "lindera-tantivy"
version = "0.23.1"
version = "0.24.0"
edition = "2021"
description = "Lindera Tokenizer for Tantivy."
documentation = "https://docs.rs/lindera-tantivy"
Expand All @@ -12,20 +12,22 @@ categories = ["text-processing"]
license = "MIT"

[features]
default = ["ipadic"] # Japanese dictionary
ipadic = ["lindera/ipadic"] # Japanese dictionary
unidic = ["lindera/unidic"] # Japanese dictionary
ko-dic = ["lindera/ko-dic"] # Korean dictionary
cc-cedict = ["lindera/cc-cedict"] # Chinese dictionary
ipadic-compress = ["lindera/ipadic-compress"]
unidic-compress = ["lindera/unidic-compress"]
ko-dic-compress = ["lindera/ko-dic-compress"]
cc-cedict-compress = ["lindera/cc-cedict-compress"]
default = [] # Japanese dictionary
ipadic = ["lindera-tokenizer/ipadic"] # Japanese dictionary
unidic = ["lindera-tokenizer/unidic"] # Japanese dictionary
ko-dic = ["lindera-tokenizer/ko-dic"] # Korean dictionary
cc-cedict = ["lindera-tokenizer/cc-cedict"] # Chinese dictionary
ipadic-compress = ["lindera-tokenizer/ipadic-compress"]
unidic-compress = ["lindera-tokenizer/unidic-compress"]
ko-dic-compress = ["lindera-tokenizer/ko-dic-compress"]
cc-cedict-compress = ["lindera-tokenizer/cc-cedict-compress"]

[dependencies]
tantivy = "0.19.2"

lindera = "0.23.1"
lindera-core = "0.24.0"
lindera-dictionary = "0.24.0"
lindera-tokenizer = "0.24.0"

[dev-dependencies]
criterion = { version = "0.4.0", features = ["html_reports"] }
Expand Down
17 changes: 6 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,11 @@ The following example enables IPADIC.

```
[dependencies]
lindera-tantivy = { version = "0.12.0", features = ["ipadic"] }
lindera-core = "0.24.0"
lindera-dictionary = "0.24.0"
lindera-tantivy = { version = "0.24.0", features = ["ipadic"] }
```

- ipadic: Japanese dictionary
- unidic: Japanese dictionary
- ko-dic: Korean dictionary
- cc-cedict: Chinese dictionary


### Basic example

```rust
Expand All @@ -32,10 +28,9 @@ use tantivy::{
Index,
};

use lindera_tantivy::{
dictionary::load_dictionary, tokenizer::LinderaTokenizer, DictionaryConfig, DictionaryKind,
Mode,
};
use lindera_core::mode::Mode;
use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
use lindera_tantivy::tokenizer::LinderaTokenizer;

fn main() -> tantivy::Result<()> {
// create schema builder
Expand Down
13 changes: 10 additions & 3 deletions benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ fn bench_indexing(c: &mut Criterion) {
use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
use tantivy::Index;

use lindera_core::mode::Mode;
use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
use lindera_tantivy::tokenizer::LinderaTokenizer;

// create schema builder
Expand Down Expand Up @@ -52,10 +54,15 @@ fn bench_indexing(c: &mut Criterion) {
docs.push(doc);
}

let dictionary_config = DictionaryConfig {
kind: Some(DictionaryKind::IPADIC),
path: None,
};
let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);

// register Lindera tokenizer
index
.tokenizers()
.register("lang_ja", LinderaTokenizer::default());
index.tokenizers().register("lang_ja", tokenizer);

// create index writer
let mut index_writer = index.writer(50_000_000).unwrap();
Expand Down
9 changes: 4 additions & 5 deletions examples/cc-cedict_example.rs → examples/cc-cedict.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@ fn main() -> tantivy::Result<()> {
Index,
};

use lindera_tantivy::{
dictionary::load_dictionary, tokenizer::LinderaTokenizer, DictionaryConfig, DictionaryKind,
Mode,
};
use lindera_core::mode::Mode;
use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
use lindera_tantivy::tokenizer::LinderaTokenizer;

// create schema builder
let mut schema_builder = Schema::builder();
Expand Down Expand Up @@ -63,7 +62,7 @@ fn main() -> tantivy::Result<()> {
kind: Some(DictionaryKind::CcCedict),
path: None,
};
let dictionary = load_dictionary(dictionary_config).unwrap();
let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);

// register Lindera tokenizer
Expand Down
9 changes: 4 additions & 5 deletions examples/ipadic_example.rs → examples/ipadic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@ fn main() -> tantivy::Result<()> {
Index,
};

use lindera_tantivy::{
dictionary::load_dictionary, tokenizer::LinderaTokenizer, DictionaryConfig, DictionaryKind,
Mode,
};
use lindera_core::mode::Mode;
use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
use lindera_tantivy::tokenizer::LinderaTokenizer;

// create schema builder
let mut schema_builder = Schema::builder();
Expand Down Expand Up @@ -63,7 +62,7 @@ fn main() -> tantivy::Result<()> {
kind: Some(DictionaryKind::IPADIC),
path: None,
};
let dictionary = load_dictionary(dictionary_config).unwrap();
let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);

// register Lindera tokenizer
Expand Down
9 changes: 4 additions & 5 deletions examples/ko-dic_example.rs → examples/ko-dic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@ fn main() -> tantivy::Result<()> {
Index,
};

use lindera_tantivy::{
dictionary::load_dictionary, tokenizer::LinderaTokenizer, DictionaryConfig, DictionaryKind,
Mode,
};
use lindera_core::mode::Mode;
use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
use lindera_tantivy::tokenizer::LinderaTokenizer;

// create schema builder
let mut schema_builder = Schema::builder();
Expand Down Expand Up @@ -63,7 +62,7 @@ fn main() -> tantivy::Result<()> {
kind: Some(DictionaryKind::KoDic),
path: None,
};
let dictionary = load_dictionary(dictionary_config).unwrap();
let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);

// register Lindera tokenizer
Expand Down
9 changes: 4 additions & 5 deletions examples/unidic_example.rs → examples/unidic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@ fn main() -> tantivy::Result<()> {
Index,
};

use lindera_tantivy::{
dictionary::load_dictionary, tokenizer::LinderaTokenizer, DictionaryConfig, DictionaryKind,
Mode,
};
use lindera_core::mode::Mode;
use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};
use lindera_tantivy::tokenizer::LinderaTokenizer;

// create schema builder
let mut schema_builder = Schema::builder();
Expand Down Expand Up @@ -63,7 +62,7 @@ fn main() -> tantivy::Result<()> {
kind: Some(DictionaryKind::UniDic),
path: None,
};
let dictionary = load_dictionary(dictionary_config).unwrap();
let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);

// register Lindera tokenizer
Expand Down
5 changes: 0 additions & 5 deletions src/dictionary.rs

This file was deleted.

10 changes: 0 additions & 10 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,2 @@
pub mod dictionary;
pub mod stream;
pub mod tokenizer;

pub type LinderaResult<T> = lindera::LinderaResult<T>;
pub type Penalty = lindera::mode::Penalty;
pub type Mode = lindera::mode::Mode;
pub type DictionaryConfig = lindera::dictionary::DictionaryConfig;
pub type UserDictionryConfig = lindera::dictionary::UserDictionaryConfig;
pub type DictionaryKind = lindera::DictionaryKind;
pub type Dictionary = lindera::Dictionary;
pub type UserDictionary = lindera::UserDictionary;
54 changes: 25 additions & 29 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@ use std::collections::VecDeque;

use tantivy::tokenizer::{BoxTokenStream, Token, Tokenizer};

use lindera::tokenizer::Tokenizer as LTokenizer;

use crate::{
dictionary::load_dictionary, stream::LinderaTokenStream, Dictionary, DictionaryConfig,
DictionaryKind, Mode, UserDictionary,
use lindera_core::{
dictionary::{Dictionary, UserDictionary},
mode::Mode,
};
use lindera_tokenizer::tokenizer::Tokenizer as LTokenizer;

use crate::stream::LinderaTokenStream;

pub struct LinderaTokenizer {
pub tokenizer: LTokenizer,
Expand All @@ -33,25 +34,6 @@ impl LinderaTokenizer {
}
}

impl Default for LinderaTokenizer {
fn default() -> Self {
// Dictionary.
let dictionary = load_dictionary(DictionaryConfig {
kind: Some(DictionaryKind::IPADIC),
path: None,
})
.unwrap();

// User dictionary.
let user_dictionary = None;

// Mode.
let mode = Mode::Normal;

Self::new(dictionary, user_dictionary, mode)
}
}

impl Tokenizer for LinderaTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
let tokens = match self.tokenizer.tokenize(text) {
Expand All @@ -77,6 +59,9 @@ impl Tokenizer for LinderaTokenizer {
mod tests {
use tantivy::tokenizer::{BoxTokenStream, Token, Tokenizer};

use lindera_core::mode::Mode;
use lindera_dictionary::{load_dictionary_from_config, DictionaryConfig, DictionaryKind};

use crate::tokenizer::LinderaTokenizer;

fn test_helper(mut tokenizer: BoxTokenStream) -> Vec<Token> {
Expand All @@ -87,8 +72,14 @@ mod tests {

#[test]
fn test_tokenizer() {
let tokens =
test_helper(LinderaTokenizer::default().token_stream("すもももももももものうち"));
let dictionary_config = DictionaryConfig {
kind: Some(DictionaryKind::IPADIC),
path: None,
};
let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);

let tokens = test_helper(tokenizer.token_stream("すもももももももものうち"));
assert_eq!(tokens.len(), 7);
{
let token = &tokens[0];
Expand Down Expand Up @@ -150,9 +141,14 @@ mod tests {

#[test]
fn test_tokenizer_lindera() {
let tokens = test_helper(
LinderaTokenizer::default().token_stream("Linderaは形態素解析エンジンです。"),
);
let dictionary_config = DictionaryConfig {
kind: Some(DictionaryKind::IPADIC),
path: None,
};
let dictionary = load_dictionary_from_config(dictionary_config).unwrap();
let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);

let tokens = test_helper(tokenizer.token_stream("Linderaは形態素解析エンジンです。"));
assert_eq!(tokens.len(), 7);
{
let token = &tokens[0];
Expand Down

0 comments on commit 90b9164

Please sign in to comment.