-
Notifications
You must be signed in to change notification settings - Fork 23
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #24 from fetchadd/master
Add Chinese support
- Loading branch information
Showing
11 changed files
with
365 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
use pipeline::Pipeline; | ||
|
||
|
||
pub fn make_pipeline() -> Pipeline { | ||
Pipeline { | ||
queue: vec![ | ||
("trimmer-zh".into(), trimmer), | ||
("stopWordFilter-zh".into(), stop_word_filter), | ||
("stemmer-zh".into(), stemmer), | ||
], | ||
} | ||
} | ||
|
||
|
||
pub fn trimmer(token: String) -> Option<String> { | ||
let ret: String = token. | ||
trim_matches(|c: char| !is_valid_char(c) ) | ||
.into(); | ||
|
||
if ret.eq("") { | ||
return None; | ||
} | ||
|
||
Some(ret) | ||
} | ||
|
||
make_stop_word_filter!([ | ||
"的", "了" | ||
]); | ||
|
||
/// Pass-through stemming stage: returns the token unchanged.
///
/// No stemming is performed; the token is always kept, so the result is
/// always `Some(token)`. The signature matches the other pipeline stages so
/// it can be queued alongside them.
fn stemmer(token: String) -> Option<String> {
    Some(token)
}
|
||
/// Reports whether `c` may appear at the edge of a token.
///
/// A character is accepted when it is either
///
/// * a Chinese character in the range U+4E00 ..= U+9FA5, or
/// * an ASCII letter (`a`–`z`, `A`–`Z`).
///
/// Everything else — digits, whitespace, punctuation, symbols — is rejected.
fn is_valid_char(c: char) -> bool {
    // The lower bound is U+4E00 (decimal 19968). The previous value, 19668
    // (U+4CD4), was a digit transposition that also admitted the Yijing
    // hexagram symbols U+4DC0..=U+4DFF and an arbitrary tail of CJK
    // Extension A.
    let is_chinese = c >= '\u{4E00}' && c <= '\u{9FA5}';

    is_chinese || c.is_ascii_alphabetic()
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
这条法国邮船白拉日隆子爵号(VicomtedeBragelonne)正向中国开来。早晨八点多钟,冲洗过的三等舱甲板湿意未干,但已坐满了人,法国人、德国流亡出来的犹太人、印度人、安南人,不用说还有中国人。海风里早含着燥热,胖人身体给炎风吹干了,上一层汗结的盐霜,仿佛刚在巴勒斯坦的死海里洗过澡。毕竟是清晨,人的兴致还没给太阳晒萎,烘懒,说话做事都很起劲。那几个新派到安南或中国租界当警察的法国人,正围了那年轻善撒娇的犹太女人在调情。俾斯麦曾说过,法国公使大使的特点,就是一句外国话不会讲;这几位警察并不懂德文,居然传情达意,引得犹太女人格格地笑,比他们的外交官强多了。这女人的漂亮丈夫,在旁顾而乐之,因为他几天来,香烟、啤酒、柠檬水沾光了不少。红海已过,不怕热极引火,所以等一会甲板上零星果皮、纸片、瓶塞之外,香烟头定又遍处皆是。法国人的思想是有名的清楚,他的文章也明白干净,但是他的做事,无不混乱、肮脏、喧哗,但看这船上的乱糟糟。这船,倚仗人的机巧,载满人的扰攘,寄满人的希望,热闹地行着,每分钟把沾污了人气的一小方小面,还给那无情、无尽、无际的大海。 |
Oops, something went wrong.