Skip to content

Commit

Permalink
Merge pull request #24 from fetchadd/master
Browse files Browse the repository at this point in the history
Add Chinese support
  • Loading branch information
mattico authored Oct 22, 2019
2 parents ccbf5bd + 9396bc5 commit 63ac832
Show file tree
Hide file tree
Showing 11 changed files with 365 additions and 8 deletions.
5 changes: 2 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@ Cargo.lock

# These are backup files generated by rustfmt
**/*.rs.bk

examples/out.json
out.json

**/node_modules/
**/node_modules/
.idea/
4 changes: 3 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,14 @@ serde_derive = "1.0.34" # First version to support #[serde(flatten)]
serde_json = "1"
strum = "0.15"
strum_macros = "0.15"
jieba-rs = "0.4.10"

[features]
default = ["languages"]
nightly = ["bench"]
bench = []

languages = ["da", "de", "du", "es", "fi", "fr", "it", "pt", "ro", "ru", "sv", "tr"]
languages = ["da", "de", "du", "es", "fi", "fr", "it", "pt", "ro", "ru", "sv", "tr", "zh"]
da = ["rust-stemmers"]
de = ["rust-stemmers"]
du = ["rust-stemmers"]
Expand All @@ -47,3 +48,4 @@ ro = ["rust-stemmers"]
ru = ["rust-stemmers"]
sv = ["rust-stemmers"]
tr = ["rust-stemmers"]
zh = ["rust-stemmers"]
13 changes: 12 additions & 1 deletion src/lang/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ macro_rules! make_stemmer {
}

/// Used to configure the `Index` for a specific language.
#[derive(Copy, Clone, Eq, PartialEq, Debug, EnumString, ToString, EnumIter)]
#[derive(Copy, Clone, Eq, PartialEq, Debug, EnumString, ToString, EnumIter, Serialize, Deserialize)]
pub enum Language {
English,
#[cfg(feature = "da")]
Expand All @@ -81,6 +81,8 @@ pub enum Language {
Swedish,
#[cfg(feature = "tr")]
Turkish,
#[cfg(feature = "zh")]
Chinese,
#[doc(hidden)]
#[strum(disabled = "true")]
__NonExhaustive,
Expand Down Expand Up @@ -123,6 +125,8 @@ impl Language {
"sv" => Some(Language::Swedish),
#[cfg(feature = "tr")]
"tr" => Some(Language::Turkish),
#[cfg(feature = "zh")]
"zh" => Some(Language::Chinese),
_ => None,
}
}
Expand Down Expand Up @@ -162,6 +166,8 @@ impl Language {
Language::Swedish => "sv",
#[cfg(feature = "tr")]
Language::Turkish => "tr",
#[cfg(feature = "zh")]
Language::Chinese => "zh",
_ => panic!("Don't use the __NonExhaustive variant!"),
}
}
Expand Down Expand Up @@ -194,11 +200,14 @@ impl Language {
Language::Swedish => ::lang::sv::make_pipeline(),
#[cfg(feature = "tr")]
Language::Turkish => ::lang::tr::make_pipeline(),
#[cfg(feature = "zh")]
Language::Chinese => ::lang::zh::make_pipeline(),
_ => panic!("Dont use the `__NonExhaustive` variant!"),
}
}
}


pub mod en;

#[cfg(feature = "da")]
Expand All @@ -225,3 +234,5 @@ pub mod ru;
pub mod sv;
#[cfg(feature = "tr")]
pub mod tr;
#[cfg(feature = "zh")]
pub mod zh;
50 changes: 50 additions & 0 deletions src/lang/zh.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
use pipeline::Pipeline;


/// Builds the `Pipeline` used for Chinese text.
///
/// The queue holds three named steps, in this order: `trimmer-zh`
/// (`trimmer`), `stopWordFilter-zh` (`stop_word_filter`) and `stemmer-zh`
/// (`stemmer`).
pub fn make_pipeline() -> Pipeline {
Pipeline {
queue: vec![
("trimmer-zh".into(), trimmer),
// NOTE(review): `stop_word_filter` is not written out in this file;
// presumably the `make_stop_word_filter!` invocation generates it — confirm.
("stopWordFilter-zh".into(), stop_word_filter),
("stemmer-zh".into(), stemmer),
],
}
}


/// Strips every leading and trailing character that `is_valid_char` rejects.
///
/// Returns `None` when nothing is left after trimming, otherwise `Some` with
/// the trimmed token. Characters in the middle of the token are not examined.
pub fn trimmer(token: String) -> Option<String> {
    let trimmed = token.trim_matches(|c: char| !is_valid_char(c));

    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_owned())
    }
}

// Stop-word list for Chinese; only two words ("的" and "了") are listed.
// NOTE(review): the macro is defined elsewhere; presumably it expands to the
// `stop_word_filter` function that `make_pipeline` registers — confirm.
make_stop_word_filter!([
"的", "了"
]);

/// Stemming step for Chinese: hands the token back unchanged, always as `Some`.
fn stemmer(token: String) -> Option<String> {
Some(token)
}

/// Returns `true` if `c` is a character the Chinese trimmer should keep.
///
/// Accepted characters are:
/// * CJK Unified Ideographs in `U+4E00..=U+9FA5` (decimal 19968..=40869),
/// * ASCII letters `a-z` and `A-Z`.
///
/// Everything else (whitespace, ASCII and full-width punctuation, ...) is
/// rejected.
///
/// NOTE(review): ASCII digits are rejected as well, so a purely numeric token
/// is trimmed down to nothing by `trimmer` — confirm that this is intended.
fn is_valid_char(c: char) -> bool {
    // The lower bound used to be written as decimal 19668 (U+4CD4), a typo for
    // 19968 (U+4E00). That let the Yijing hexagram symbols (U+4DC0..=U+4DFF)
    // and an arbitrary slice of CJK Extension A through as "Chinese chars".
    const FIRST_CHINESE_CHAR: char = '\u{4E00}';
    const LAST_CHINESE_CHAR: char = '\u{9FA5}';

    (FIRST_CHINESE_CHAR <= c && c <= LAST_CHINESE_CHAR) || c.is_ascii_alphabetic()
}
22 changes: 21 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ extern crate rust_stemmers;
#[cfg(test)]
#[macro_use]
extern crate maplit;
#[cfg(feature = "zh")]
extern crate jieba_rs;


/// The version of elasticlunr.js this library was designed for.
pub const ELASTICLUNR_VERSION: &str = "0.9.5";
Expand Down Expand Up @@ -149,6 +152,7 @@ impl IndexBuilder {
document_store: DocumentStore::new(self.save),
pipeline: self.pipeline.unwrap_or_default(),
version: ::ELASTICLUNR_VERSION,
lang: Language::English,
}
}
}
Expand All @@ -165,6 +169,7 @@ pub struct Index {
pub version: &'static str,
index: BTreeMap<String, InvertedIndex>,
pub document_store: DocumentStore,
lang: Language,
}

impl Index {
Expand Down Expand Up @@ -226,6 +231,7 @@ impl Index {
ref_field: "id".into(),
version: ::ELASTICLUNR_VERSION,
document_store: DocumentStore::new(true),
lang: lang,
}
}

Expand Down Expand Up @@ -256,7 +262,20 @@ impl Index {
continue;
}

let tokens = self.pipeline.run(pipeline::tokenize(value.as_ref()));
let raw_tokens: Vec<String>;

match self.lang {
#[cfg(feature = "zh")]
Language::Chinese => {
raw_tokens = pipeline::tokenize_chinese(value.as_ref());
},
_ => {
raw_tokens = pipeline::tokenize(value.as_ref());
}
}

let tokens = self.pipeline.run(raw_tokens);

self.document_store
.add_field_length(doc_ref, field, tokens.len());

Expand All @@ -266,6 +285,7 @@ impl Index {

for (token, count) in &token_freq {
let freq = (*count as f64).sqrt();

self.index
.get_mut(field)
.expect(&format!("InvertedIndex does not exist for field {}", field))
Expand Down
13 changes: 13 additions & 0 deletions src/pipeline.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
//! Defines the pipeline which processes text for inclusion in the index. Most users do not need
//! to use this module directly.

use serde::ser::{Serialize, SerializeSeq, Serializer};
#[cfg(feature = "zh")]
use jieba_rs::Jieba;

/// Splits a text string into a vector of individual tokens.
pub fn tokenize(text: &str) -> Vec<String> {
Expand All @@ -11,6 +14,16 @@ pub fn tokenize(text: &str) -> Vec<String> {
.collect()
}

/// Splits Chinese text into a vector of individual tokens using jieba's
/// search-mode segmentation (`cut_for_search`, with the HMM flag set to `false`).
///
/// The returned tokens are owned copies of the segments jieba reports.
#[cfg(feature = "zh")]
pub fn tokenize_chinese(text: &str) -> Vec<String> {
    // Constructing a `Jieba` is far more expensive than a single segmentation
    // call, and this function runs once per indexed field. Build the segmenter
    // lazily, once per thread, and reuse it instead of recreating it per call.
    thread_local! {
        static JIEBA: Jieba = Jieba::new();
    }

    JIEBA.with(|jieba| {
        jieba
            .cut_for_search(text, false)
            .into_iter()
            .map(String::from)
            .collect()
    })
}

/// The function type used for each step in a pipeline.
pub type PipelineFn = fn(String) -> Option<String>;

Expand Down
1 change: 1 addition & 0 deletions tests/data/zh.in.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
这条法国邮船白拉日隆子爵号(VicomtedeBragelonne)正向中国开来。早晨八点多钟,冲洗过的三等舱甲板湿意未干,但已坐满了人,法国人、德国流亡出来的犹太人、印度人、安南人,不用说还有中国人。海风里早含着燥热,胖人身体给炎风吹干了,上一层汗结的盐霜,仿佛刚在巴勒斯坦的死海里洗过澡。毕竟是清晨,人的兴致还没给太阳晒萎,烘懒,说话做事都很起劲。那几个新派到安南或中国租界当警察的法国人,正围了那年轻善撒娇的犹太女人在调情。俾斯麦曾说过,法国公使大使的特点,就是一句外国话不会讲;这几位警察并不懂德文,居然传情达意,引得犹太女人格格地笑,比他们的外交官强多了。这女人的漂亮丈夫,在旁顾而乐之,因为他几天来,香烟、啤酒、柠檬水沾光了不少。红海已过,不怕热极引火,所以等一会甲板上零星果皮、纸片、瓶塞之外,香烟头定又遍处皆是。法国人的思想是有名的清楚,他的文章也明白干净,但是他的做事,无不混乱、肮脏、喧哗,但看这船上的乱糟糟。这船,倚仗人的机巧,载满人的扰攘,寄满人的希望,热闹地行着,每分钟把沾污了人气的一小方小面,还给那无情、无尽、无际的大海。
Loading

0 comments on commit 63ac832

Please sign in to comment.