-
Notifications
You must be signed in to change notification settings - Fork 6
/
process-Google-extracted-pinyins.py
executable file
·59 lines (44 loc) · 1.41 KB
/
process-Google-extracted-pinyins.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# Extract exacts pinyins of chars
# Output text file of pinyins
# Format of input file:
# =====================
# 01 pinyin 1.char (pinyin) 2.char (pinyin) .... 6.char (pinyin)
# Much of this info can be ignored. We just need the chars and the pinyins.
import codecs
import re
# output file
f2 = codecs.open("exact-Google-pinyins.txt", encoding="UTF-8", mode='w')
dict = {}
pattern = re.compile('[0-9]\. (\w+) \(([^\)]*)\)')
for i in ["1", "2", "3"]:
# open input file
f = codecs.open("web/scraped-Google-" + i + ".txt", encoding="UTF-8", mode='r')
for line in f:
print ("\033[0;34m", line, "\033[0m")
# * The following fails because only the last item in a ()+ pattern is captured:
# pinyins = re.match('[0-9][0-9] [a-z]+ (([0-9]\.(\w+) \(([^\)]*)\)[ |\n])+)', line)
pinyins = pattern.findall(line)
# print(pinyins)
# **** Add pinyin to memory:
for p in pinyins:
print(p[0], p[1])
if not p[0] in dict:
dict.setdefault(p[0], []) # each dict's value is a list
dict[p[0]].append(p[1])
elif p[1] in dict[p[0]]:
pass
else:
dict[p[0]].append(p[1])
# **** multiple pinyins:
# print(p[0], p[1], end='')
# print("\033[0;31m Error: ", end='')
# print(dict[p[0]], "\033[0m")
f.close()
# print(dict)
# **** Output char list sorted by freq:
for char in dict:
for pinyin in dict[char]:
f2.write(char + pinyin + '\n')
f2.close()