# get_dacilin.py
import re
import json
from tqdm import tqdm
# Relation names that are treated as "the tail is another name for the head"
# (alias, abbreviation, full name, former name, pen name, ...).
LST_REL = ["别名", "别称", "简称", "同义词", "本名", "艺名", "又称", "又名", "全称", "全名", "其他名称", "中文学名",
           "学名", "古称", "其他译名", "俗称", "旧称", "中文简称", "亦称", "昵称", "另名", "泛称", "其他称呼", "【别 名】",
           "真名", "曾用名", "古时称", "笔名", "俗名", "也称", "又称为", "美称", "网名", "现名", "小名", "代称", "雅称",
           "自称", "上古称谓", "现今称谓", "同类称谓", "中古称谓", "署名", "也称为", "通用名", "同名", "同义", "尊称",
           "现称", "中文别名", "化学名", "病名", "同义词", "或称", "曾译名", "人称", "其他名字", "同称", "原称", "爱称",
           "始称", "另称", "乃称", "改称", "原名", "又被称为", "称之为", "后世尊称", "谦称", "今名", "素称", "谐称",
           "亦名", "号称", "译名", "曾称", "概念全称", "美名", "世称", "医学称呼", "一般称为", "化名", "同义旧称", "曾名",
           "别名名称", "明代称", "清代称", "中文全名", "小名", "药名", "谥称", "简称", "更名后"]
# Remove repeated entries. dict.fromkeys keeps the order in which the names
# are listed, so the result no longer depends on the string hash seed.
LST_REL = list(dict.fromkeys(LST_REL))
# Lookup table used for fast membership tests on relation names.
DICT_REL = {k: True for k in LST_REL}
# get_dacilin() removes all whitespace from a relation before looking it up,
# so a name that itself contains whitespace (e.g. "【别 名】") could never
# match. Register the whitespace-free form of every name as well.
DICT_REL.update({re.sub(r"\s+", "", k): True for k in LST_REL})
def check_relation(load_path):
    """
    Print the distinct name-like relations that occur in a triple file.

    Every line is stripped and split on ";". Lines that do not yield exactly
    three fields are ignored. The middle field is taken as the relation and is
    reported when it contains "名", "称" or "同义". Each relation is printed
    once, in order of first appearance.

    :param load_path: path of the triple file (e.g. triple.txt), read as UTF-8
    :return: None, the relations are written to standard output
    """
    # A dict is used as an insertion-ordered set: O(1) membership test while
    # keeping the first-seen order for printing.
    seen = dict()
    with open(load_path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split(";")
            if len(fields) != 3:
                continue
            relation = fields[1]
            if "名" in relation or "称" in relation or "同义" in relation:
                seen.setdefault(relation, True)
    for rel in seen:
        print(rel)
    return None
def clean_entity(s):
    """
    Clean one raw entity string and split it into individual entities.

    Processing steps:
      1. lower-case the string;
      2. drop everything from the first opening bracket ("[", "(", full-width
         "(" or "【") onwards;
      3. treat whitespace and the separators , 、 ; . / \\ (ASCII and
         full-width comma/semicolon) as delimiters between entities;
      4. if the string is a list (has a delimiter) and ends with "等",
         remove the trailing "等"; remove any trailing "…" characters;
      5. remove curly double quotes from each piece, then discard pieces of
         length <= 1 and pieces made only of the letters a-z.

    :param s: string, the raw entity
    :return: list of cleaned entity strings (possibly empty)
    """
    s = s.lower()
    # Cut off bracketed annotations. \uff08 is the full-width "(".
    s = re.split(r"[\[(\uff08【]", s, maxsplit=1)[0]
    # Normalise every kind of separator to an ASCII comma.
    # \uff0c / \uff1b are the full-width comma / semicolon.
    s = re.sub(r"\s+", ",", s)
    s = re.sub(r"[,\uff0c、;\uff1b./\\]", ",", s)
    # A trailing "等" is only removed from lists, because a single entity may
    # legitimately end with that character.
    if "," in s and s.endswith("等"):
        s = s.rstrip("等")
    s = s.rstrip("…")

    lst_s = list()
    for w in s.split(","):
        w = w.strip().replace("“", "").replace("”", "")
        if len(w) <= 1:  # delete word with single char
            continue
        if re.fullmatch(r"[a-z]+", w):  # delete word made only of a-z
            continue
        lst_s.append(w)
    return lst_s
def _collect_synonyms(load_path):
    """Map each cleaned head entity to the cleaned tails linked by a relation in DICT_REL."""
    collected = dict()
    with open(load_path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.strip().split(";")
            if len(fields) != 3:
                continue
            head, relation, tail = fields
            if not (head and relation and tail):
                continue
            relation = re.sub(r"\s+", "", relation)
            if relation not in DICT_REL:
                continue
            heads = clean_entity(head)
            tails = clean_entity(tail)
            for h in heads:
                for t in tails:
                    # Skip pairs where one name contains the other.
                    if h in t or t in h:
                        continue
                    # Inner dict acts as an insertion-ordered set of tails.
                    collected.setdefault(h, dict())[t] = True
    return {h: list(ts) for h, ts in collected.items()}


def _symmetrize_synonyms(dict_synonym):
    """Make the head -> tails mapping symmetric and wrap it in the synonym/antonym format."""
    merged = dict()

    def _link(word, others):
        # Ordered, duplicate-free accumulation of the synonyms of `word`.
        entry = merged.setdefault(word, dict())
        for other in others:
            entry[other] = True

    for k, lst_v in tqdm(dict_synonym.items()):
        _link(k, lst_v)
        for v in lst_v:
            # Every tail is a synonym of the head and of its sibling tails.
            _link(v, [neighbor for neighbor in lst_v if neighbor != v] + [k])
    return {word: {"synonym": list(syns), "antonym": list()}
            for word, syns in merged.items()}


def get_dacilin(load_path, save_path="corpus4n_cilin.json"):
    """
    Build a synonym/antonym JSON data set from a ";"-separated triple file.

    Lines of the form ``head;relation;tail`` whose relation (after removing
    whitespace) is a key of DICT_REL are used. Head and tail are cleaned with
    clean_entity, and every resulting (head, tail) pair in which neither
    string contains the other is recorded as a synonym pair. The pairs are
    then made symmetric: each word lists its head, its tails and the sibling
    tails that share a head with it.

    The result maps every word to ``{"synonym": [...], "antonym": []}``; the
    antonym list is always empty and exists only to keep a uniform format.

    :param load_path: path of the triple file (e.g. triple.txt), read as UTF-8
    :param save_path: path of the JSON file to write (UTF-8); defaults to
        "corpus4n_cilin.json"
    :return: None
    """
    print("Find Synonym ...")
    dict_synonym = _collect_synonyms(load_path)
    # format finetune
    print("Format Fine-tune ...")
    d_n_new = _symmetrize_synonyms(dict_synonym)
    # save
    print("Save data...")
    # ensure_ascii=False writes the Chinese text as-is, so the file encoding
    # is fixed to UTF-8 instead of relying on the platform default.
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(d_n_new, f, ensure_ascii=False, indent=2)
    return None
# Run the conversion only when this file is executed as a script. The name
# is compared with "==": a substring test ("in") would also be true for
# module names such as "main" and would run the job on import.
if __name__ == "__main__":
    # To list candidate relation names instead, call:
    # check_relation(load_path="word_dacilin/triple.txt")
    get_dacilin(load_path="word_dacilin/triple.txt")