Skip to content

Commit

Permalink
加入符号词汇
Browse files Browse the repository at this point in the history
  • Loading branch information
fkxxyz committed Jun 13, 2020
1 parent ebf023f commit 86598be
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 7 deletions.
17 changes: 10 additions & 7 deletions build
Original file line number Diff line number Diff line change
Expand Up @@ -123,20 +123,23 @@ extract 360万中文词库+词性+词频.zip || exit
# Symlink the upstream essay and pinyin dictionary files into the current
# directory; each link is named after the last path component of its target.
for target in rime-essay/essay.txt rime-pinyin-simp/pinyin_simp.dict.yaml; do
  ln -sf "$target" "${target##*/}" || exit
done

# Start generating the dictionaries: run clover-dict-gen with $minfreq passed
# as --minfreq, then run thuocl2rime on every THUOCL/data/THUOCL_* file.
# Any failing step aborts the script.
../src/clover-dict-gen --minfreq="$minfreq" || exit
for i in THUOCL/data/THUOCL_*; do
  echo "转换 $i"
  # Quoted so a path containing whitespace or glob characters is passed as a
  # single argument.
  ../src/thuocl2rime "$i" || exit
done

# Generate the symbol lists: enter rime-symbols/opencc (creating the opencc
# directory if needed), run ../rime-symbols-gen from there, then go back up
# two levels. The first failing step aborts the script with its exit status.
{
  cd rime-symbols &&
    mkdir -p opencc &&
    cd opencc &&
    ../rime-symbols-gen &&
    cd ../..
} || exit

# Generate the symbol vocabulary: concatenate every */opencc/*.txt file, run
# the result through opencc with the t2s.json configuration, and drop adjacent
# duplicate lines into symbols.txt.
#
# Each stage runs as its own checked command. In a single pipeline only the
# exit status of the last command (uniq) is visible, so a failing cat or
# opencc would leave an empty or partial symbols.txt and the build would
# continue silently, unlike every other step in this script.
cat */opencc/*.txt > symbols.raw.txt || exit
opencc -c t2s.json < symbols.raw.txt > symbols.t2s.txt || exit
uniq symbols.t2s.txt > symbols.txt || exit
# The intermediate files are no longer needed once symbols.txt is written.
rm -f symbols.raw.txt symbols.t2s.txt

# Start generating the dictionaries: run clover-dict-gen with $minfreq passed
# as --minfreq, then run thuocl2rime on every THUOCL/data/THUOCL_* file.
# Any failing step aborts the script.
../src/clover-dict-gen --minfreq="$minfreq" || exit
for i in THUOCL/data/THUOCL_*; do
  echo "转换 $i"
  # Quoted so a path containing whitespace or glob characters is passed as a
  # single argument.
  ../src/thuocl2rime "$i" || exit
done

# Populate the data directory: create ../data if it does not exist and copy
# every ../src/*.yaml file into it. Either step failing aborts the script.
mkdir -p ../data || exit
cp ../src/*.yaml ../data || exit
Expand Down
10 changes: 10 additions & 0 deletions src/clover-dict-gen
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,16 @@ def main(args):
100000, PrintProcess('正在合并袖珍简化字拼音的词库 (%s/%s)').process)
print('成功合并袖珍简化字拼音 %s 个汉字, %s 个词组。' % r)

# 合并转换符号词汇
r = generator.mergeDict(
open('symbols.txt',
'r',
encoding = 'utf-8').read(),
10000,
0,
100000, PrintProcess('正在合并符号词汇 (%s/%s)').process)
print('成功合并符号词汇 %s 个汉字, %s 个词组。' % r)

word_dict_name = 'clover.base'
parse_dict_name = 'clover.phrase'

Expand Down

0 comments on commit 86598be

Please sign in to comment.