-
Notifications
You must be signed in to change notification settings - Fork 10
/
compile_char_dict.py
135 lines (120 loc) · 3.86 KB
/
compile_char_dict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import re
import sys
import math
import random
import string
import textwrap
import itertools
import unicodedata
import collections
# Directory that holds every input file this script reads.
base_dir = 'wordlist'

# Word-list file names, grouped by the language code they belong to.
char_list_files = dict(
    zhs=(
        'cn_tygfhzb.txt',
        'cn_cyz3500.txt',
        'cn_ywjy3500.txt',
        'cn_gb2312.txt',
    ),
    zht=(
        'tw_cygzb.txt',
        'tw_ccygzb.txt',
        'hk_cyzzxb.txt',
        'tw_big5changyong.txt',
    ),
)

# Characters that are removed from the candidate set after it has been built.
exclude_chars = set(
    '氵冫忄阝刂亻扌犭纟糹牜礻衤訁讠釒钅飠饣丬艹宀冖覀罒罓灬爫丨丿乀乁乄乚丶丷亅丄丅丆乛囗'
)
# The first command-line argument selects the target language.  When it is
# missing the script uses 'zhs'; any value other than 'zhs' selects 'zht'.
requested_lang = sys.argv[1] if len(sys.argv) >= 2 else 'zhs'

# Both targets use the same secondary guarantee list; they only differ in
# whether its entries are mapped through the conversion table first.
guarantee_list2 = 'cn_tygfhzb.txt'

if requested_lang == 'zhs':
    lang, other_lang = 'zhs', 'zht'
    conv_file = 'TSCharacters.txt'
    freq_file = 'charfreq_zhs.txt'
    # (word-list file, number of leading entries taken as guaranteed)
    guarantee_list = (
        ('cn_tygfhzb.txt', 3500),
        ('cn_cyz3500.txt', 3500),
        ('cn_ywjy3500.txt', 3500),
        ('cn_gb2312.txt', 3755),
    )
    guarantee_list2_conv = False
else:
    lang, other_lang = 'zht', 'zhs'
    conv_file = 'STCharacters.txt'
    freq_file = 'charfreq_zht.txt'
    # (word-list file, number of leading entries taken as guaranteed)
    guarantee_list = (
        ('tw_cygzb.txt', 5000),
        ('hk_cyzzxb.txt', 5000),
        ('tw_big5changyong.txt', 5401),
    )
    guarantee_list2_conv = True

# The second command-line argument selects the size threshold used by the
# selection loop.  A missing argument behaves like 'lstm'; every other value
# also extends the exclusion set with two more characters.
requested_model = sys.argv[2] if len(sys.argv) >= 3 else 'lstm'
if requested_model == 'lstm':
    char_num = 8000
else:
    exclude_chars.update('彳亍')
    char_num = 7000
# Load every word list of every language: file name -> list of its lines,
# each stripped of surrounding whitespace and NFKC-normalized, in file order.
char_lists = {}
for list_name in itertools.chain.from_iterable(char_list_files.values()):
    list_path = os.path.join(base_dir, list_name)
    with open(list_path, 'r', encoding='utf-8') as list_fh:
        char_lists[list_name] = [
            unicodedata.normalize('NFKC', raw.strip()) for raw in list_fh
        ]
# Conversion table: each line is "<key>\t<whitespace-separated values>".
# The result maps the key to the list of its values; a later line with the
# same key replaces the earlier one.
with open(os.path.join(base_dir, conv_file), 'r', encoding='utf-8') as conv_fh:
    split_lines = (entry.rstrip().split('\t', 1) for entry in conv_fh)
    conv_table = {source: targets.split() for source, targets in split_lines}
# Candidate characters in first-seen order; every entry starts with a
# frequency of 0.
char_set = collections.OrderedDict()

# Word lists of the target language contribute their entries unchanged.
for list_name in char_list_files[lang]:
    list_path = os.path.join(base_dir, list_name)
    with open(list_path, 'r', encoding='utf-8') as list_fh:
        for raw in list_fh:
            char_set[unicodedata.normalize('NFKC', raw.strip())] = 0

# Entries of the other language's lists are mapped through conv_table; an
# entry without a mapping is added as it is.
for list_name in char_list_files[other_lang]:
    list_path = os.path.join(base_dir, list_name)
    with open(list_path, 'r', encoding='utf-8') as list_fh:
        for raw in list_fh:
            source_ch = unicodedata.normalize('NFKC', raw.strip())
            converted = conv_table.get(source_ch, (source_ch,))
            char_set.update(dict.fromkeys(converted, 0))

# Finally remove everything on the exclusion list.
for unwanted in exclude_chars:
    char_set.pop(unwanted, None)
# Primary guarantee set: the first `limit` entries of every word list named
# in guarantee_list.
guarantee_chars = {
    ch
    for list_name, limit in guarantee_list
    for ch in char_lists[list_name][:limit]
}

# Secondary guarantee set: all entries of one word list, optionally mapped
# through conv_table (an entry without a mapping is kept as it is).
secondary_list = char_lists[guarantee_list2]
if guarantee_list2_conv:
    guarantee_chars2 = set(itertools.chain.from_iterable(
        conv_table.get(ch, (ch,)) for ch in secondary_list
    ))
else:
    guarantee_chars2 = set(secondary_list)
# Read the frequency table, store the count of every candidate character
# that appears in it, and keep a running total of the stored counts.
total_freq = 0
freq_path = os.path.join(base_dir, freq_file)
with open(freq_path, 'r', encoding='utf-8') as freq_fh:
    for record in freq_fh:
        fields = record.strip().split()
        if len(fields) >= 2:
            # Unpacking requires exactly two whitespace-separated columns.
            token, count = fields
            if token in char_set:
                count = int(count)
                char_set[token] = count
                total_freq += count
# Rank every candidate: highest frequency first, ties ordered by the
# character itself.
sorted_chars = sorted(char_set.items(), key=lambda x: (-x[1], x[0]))

# Seed the result with the guaranteed characters.  A guaranteed character
# that is no longer in char_set (it was removed through exclude_chars) is
# dropped: it must not bypass the exclusion list, and the final sort looks
# every selected character up in char_set, so keeping it would raise
# KeyError.
all_chars = {ch for ch in guarantee_chars if ch in char_set}

acc_freq = 0
last_freq = None
# Selection phases:
#   0 - add every character until cumulative coverage exceeds 99.9 %
#   1 - coverage reached; keep adding while the frequency equals that of
#       the last added character, then move on to phase 2
#   2 - only add characters that are in guarantee_chars2; switch to phase 3
#       once all_chars holds at least char_num characters
#   3 - keep adding while the frequency equals that of the last added
#       character, stop at the first change
status = 0
for ch, freq in sorted_chars:
    # Keys that are not exactly one code point (for example the empty string
    # produced by a blank word-list line) would make ord() raise TypeError,
    # so they are skipped together with characters above U+FFFF.
    if len(ch) != 1 or ord(ch) > 0xffff:
        continue
    elif freq == 0:
        # Entries are in descending frequency order; stop at the first one
        # that has no recorded frequency.
        break
    acc_freq += freq
    if status == 0 and acc_freq / total_freq > 0.999:
        status = 1
    if status == 1 and freq != last_freq:
        status = 2
    if status == 2:
        if len(all_chars) >= char_num:
            status = 3
        # The guarantee_chars2 filter also applies to the iteration in which
        # phase 3 is entered.
        if ch not in guarantee_chars2:
            continue
    if status == 3 and freq != last_freq:
        break
    all_chars.add(ch)
    last_freq = freq

# Emit one character per line, most frequent first.
for ch in sorted(all_chars, key=lambda x: (-char_set[x], x)):
    print(ch)