-
Notifications
You must be signed in to change notification settings - Fork 4
/
candidate_phrase.py
285 lines (260 loc) · 13.3 KB
/
candidate_phrase.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
import os
import csv
from pyltp import SentenceSplitter
from pyltp import Segmentor
from pyltp import Postagger
from pyltp import Parser
import re
import sys
# Column index, within each input CSV row, of the text that is processed.
content_index = 5

# Marker substrings: when a sentence shorter than 400 characters contains one
# of these, that sentence and the rest of the app description are skipped.
end_str_list = [
    '与我们交流', '联系我们', '欢迎联系', '本次更新', '官方网站', '客服电话', '客服热线', '全国热线', '官方微信', 'http',
    'www', '意见反馈。', '联系方式。', '更新内容:', '微博:', 'QQ群:', '官方客服', '邮箱:', '微信公众号:', '帮助与反馈',
    '官方微博:', '官网:', '全国热线', '热线:', '微信公众平台:', '近期更新:', '联系我们:', '连续包月', '订阅服务'
]

# Governing words for which no verb-object / coordinated phrase is emitted.
# Fix: '知' and '方便' were previously written without a comma between them,
# so Python concatenated them into the single (never matching) entry '知方便'.
VOB_SBV_COO_str_list = [
    '是', '为', '免', '无需', '在', '达', '无', '等', '愿', '让', '求', '例如', '带', '多', '荣获', '落', '跨', '包',
    '用', '帮', '开通', '上线', '运用', '融入', '没有', '涵盖', '包括', '覆盖', '知', '方便', '集成', '确保', '预设',
    '包含', '云集', '超过', '超', '靠', '给', '成', '变', '欢迎', '贯穿', '起', '没', '掉', '破', '出品', '想', '讲',
    '说', '集', '小', '有', '拥有', '到', '关爱', '结合', '来', '爱', '还有', '及时达', '当', '变化', '背书', '告别',
    '无惧', '力求', '按', '吃', '满足', '助', '搜罗', '解放', '趴', '希望', '包括', '保', '互通', '请', '推出', '发育',
    '孕育', '还是', '怀', '成为', '见证', '翻开', '翻', '遍布', '形成', '扩大', '致力', '拒绝', '沉淀', '耕耘', '通过',
    '享受', '懂', '变', '做到'
]

# Single words that are emitted as a candidate phrase on their own whenever
# they appear as a token of a segmented sentence.
individual_feature_words = [
    '登录', '注册', '摇一摇', '扫一扫', '朋友圈', '直播', '搜索', '电话', '短信', '打卡', '审批', '汇报', '单聊', '群聊',
    '收款', '红包', '预约', '预订', '付款', '打车', '支付', '定位', '导航', '分享', '打车', '下单', '外卖', '改签', '退票',
    '打车', '补票', '抢票', '打车', '聊天', '购物', '转账', '信用住', '听说读写', '音标', '发音', '词性', '释义', '签到',
    '用法', '搭配', '换肤', '答题', '已读未读'
]
class CandidatePhraseExtractor:
    """Extract candidate feature phrases from app descriptions with pyltp.

    Typical use is three calls in order: ``read_data_from_file()`` loads the
    CSV rows, ``get_seg_sentence_from_pyltp()`` cleans, splits and segments
    the text column, and ``get_postage_and_parser()`` tags and parses every
    sentence, collects phrases from the dependency arcs and writes the unique
    phrases to the output file.
    """

    # Everything except ASCII letters/digits and the \u4e00-\u9fa5 range is
    # stripped from a phrase before it is written out.
    _NON_WORD = re.compile('[^0-9A-Za-z\u4e00-\u9fa5]')

    def __init__(self, input_file, output_file, pyltp_path=None):
        """Store the file paths and load the LTP models.

        :param input_file: path of the CSV file to read.
        :param output_file: path of the tab-separated file to write.
        :param pyltp_path: directory holding ``cws.model``, ``pos.model``,
            ``parser.model`` and ``word_segmentation.txt``; defaults to
            ``pyltp-resource/ltp-model``.
        """
        # ltp model files
        LTP_DATA_DIR = 'pyltp-resource/ltp-model' if pyltp_path is None else pyltp_path
        cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
        pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
        par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')
        cut_text_path = os.path.join(LTP_DATA_DIR, 'word_segmentation.txt')

        self.input_file = input_file
        self.output_file = output_file
        self.raw_data = []
        # One entry per input row; each entry is a list of sentences and each
        # sentence is the list of its segmented words.
        self.sentence_cutted = []

        self.postagger = Postagger()  # part-of-speech tagger
        self.postagger.load(pos_model_path)
        self.parser = Parser()  # dependency parser
        self.parser.load(par_model_path)
        self.segmentor = Segmentor()  # word segmenter with an extra lexicon
        self.segmentor.load_with_lexicon(cws_model_path, cut_text_path)

    @staticmethod
    def remove_serial_number(text):
        """Replace list numbering and decorative symbols with '。'.

        A digit followed by one of ``,`` ``.`` ``、`` and every character of
        the symbol class below is replaced by a single '。'.
        """
        # Raw string: '\d' in a normal string literal is an invalid escape.
        text = re.sub(r'\d[,.、]', '。', text)
        text = re.sub('[●,=√#¥%&,【】◆ ★◎☆⊙]', '。', text)
        return text

    @staticmethod
    def remove_exce_punc(text):
        """Collapse runs of a repeated punctuation mark into one mark."""
        # NOTE(review): ',' and '!' each appear twice in this string; one of
        # each pair may originally have been a different (full-width)
        # character - confirm against the upstream file.
        for mark in '。,,=!!-—#':
            doubled = mark + mark
            while doubled in text:
                text = text.replace(doubled, mark)
        return text

    def read_data_from_file(self):
        """Append every row of the UTF-8 CSV ``input_file`` to ``raw_data``."""
        with open(self.input_file, encoding='utf-8') as f:
            self.raw_data.extend(csv.reader(f))

    def get_seg_sentence_from_pyltp(self):
        """Clean, sentence-split and segment the text column of every row.

        Appends one list of segmented sentences per row of ``raw_data`` to
        ``sentence_cutted`` and then releases the segmenter model.
        """
        for row in self.raw_data:
            if len(row) <= content_index:
                # Blank or short row: there is no text column. Keep an empty
                # entry so sentence_cutted stays aligned with raw_data.
                self.sentence_cutted.append([])
                continue
            # Strip characters that only serve as numbering / decoration.
            cleaned = self.remove_exce_punc(
                self.remove_serial_number(row[content_index]))
            segmented = []  # result for this one description
            for sent in SentenceSplitter.split(cleaned):
                # A sentence containing an end marker stops processing of the
                # rest of this description. The length limit keeps a whole
                # description that was wrongly split as a single sentence.
                has_end_marker = any(end_str in sent for end_str in end_str_list)
                if has_end_marker and len(sent) < 400:
                    break
                segmented.append(list(self.segmentor.segment(sent)))
            self.sentence_cutted.append(segmented)
        self.segmentor.release()

    @staticmethod
    def _modifier_chain(arcs, sentence, j):
        """Return word j preceded by those of the two words directly before
        it that are ATT modifiers of j (never wrapping around the sentence)."""
        words = []
        for k in (j - 2, j - 1):
            # k >= 0: a negative index would silently address the sentence end.
            if k >= 0 and arcs[k].relation == 'ATT' and arcs[k].head - 1 == k + (j - k):
                words.append(sentence[k])
        words.append(sentence[j])
        return words

    @staticmethod
    def _att_modifiers(arcs, sentence, head_indices):
        """Return, in sentence order, every word attached by an ATT arc to one
        of the 0-based positions in ``head_indices``."""
        return [sentence[k] for k in range(len(arcs))
                if arcs[k].relation == 'ATT' and arcs[k].head - 1 in head_indices]

    def _phrases_for_sentence(self, sentence):
        """Return the candidate phrases found in one segmented sentence.

        Each item is either a single word (str) or a list of words.
        """
        # NOTE(review): the nesting of the branches below was reconstructed
        # from a copy of this file that had lost its indentation - confirm
        # against the upstream source.
        phrases = []
        if not sentence:
            return phrases
        if sentence[-1] in ['。', '!', '?']:
            sentence = sentence[0:-1]
        if not sentence:
            return phrases

        postags = list(self.postagger.postag(sentence))  # POS tags
        arcs = self.parser.parse(sentence, postags)  # dependency arcs
        count = len(arcs)
        seen_att = set()  # words already used by pattern 2 in this sentence
        has_vob = any(arcs[i].relation == 'VOB' for i in range(count))

        for i in range(count):
            word = sentence[i]
            # pattern 10: a single word that names a feature by itself
            if word in individual_feature_words:
                phrases.append(word)
            if postags[i] == 'wp':
                continue

            relation = arcs[i].relation
            # arc.head is 1-based; for the root (head 0) these become -1 and
            # therefore address the last word, exactly as before.
            head = arcs[i].head - 1
            head_word = sentence[head]
            grand = arcs[head].head - 1
            grand_word = sentence[grand]

            # pattern 2: only for sentences without any verb-object arc
            if not has_vob and relation == 'ATT':
                # each modified word and each modifier is used only once
                if head_word in seen_att:
                    continue
                seen_att.add(head_word)
                if word in seen_att:
                    continue
                seen_att.add(word)
                phrase = []
                for j in range(i, count):
                    if arcs[j].relation == 'ATT' and arcs[j].head - 1 == head:
                        phrase.extend(self._modifier_chain(arcs, sentence, j))
                phrase.append(head_word)
                # the modified word itself modifies a further word
                if arcs[head].relation in ('ATT', 'ADV'):
                    phrase.append(grand_word)
                phrases.append(phrase)

            if relation == 'VOB':
                # pattern 1 and pattern 3: verb + modifiers of object + object
                if head_word in VOB_SBV_COO_str_list:
                    continue
                phrase = [head_word]
                for j in range(count):
                    if arcs[j].relation == 'ATT' and arcs[j].head - 1 == i:
                        phrase.extend(self._modifier_chain(arcs, sentence, j))
                phrase.append(word)
                phrases.append(phrase)
            elif relation == 'SBV':
                # NOTE(review): the previous code assembled a phrase for
                # subject-verb arcs here but reset it without ever storing it,
                # so these arcs produce no output. Kept that way; confirm
                # whether a phrase was meant to be emitted.
                pass
            elif relation == 'COO':
                # pattern 5, 6, 8 and 9: coordinated with a verb's object
                if arcs[head].relation == 'VOB':
                    if grand_word in VOB_SBV_COO_str_list:
                        continue
                    phrase = [grand_word]
                    phrase.extend(self._att_modifiers(arcs, sentence, (i,)))
                    phrase.append(word)
                    phrases.append(phrase)
            elif relation == 'FOB':
                # pattern 4: fronted object
                phrase = [head_word]
                phrase.extend(self._att_modifiers(arcs, sentence, (i, head)))
                phrase.append(word)
                phrases.append(phrase)
            else:  # DBL, RAD, HED ...
                # Fixes: the previous-word test used ('WP' or 'ADV'), which is
                # just 'WP'; and for i == 0 it looked at the last word.
                after_wp_or_adv = i > 0 and arcs[i - 1].relation in ('WP', 'ADV')
                for j in range(count):
                    if arcs[j].head - 1 != i:
                        continue
                    if arcs[j].relation == 'FOB':
                        phrase = [head_word]
                        phrase.extend(self._att_modifiers(arcs, sentence, (j,)))
                        phrase.append(sentence[j])
                        phrases.append(phrase)
                    elif arcs[j].relation == 'VOB':
                        # pattern 7
                        if after_wp_or_adv:
                            continue
                        phrase = [word]
                        phrase.extend(self._att_modifiers(arcs, sentence, (j,)))
                        phrase.append(sentence[j])
                        phrases.append(phrase)
                # the word directly before this one is an ATT modifier of it
                if i > 0 and arcs[i - 1].relation == 'ATT' and arcs[i - 1].head == i + 1:
                    phrase = self._att_modifiers(arcs, sentence, (i - 1,))
                    phrase.append(sentence[i - 1])
                    phrase.append(word)
                    phrases.append(phrase)
        return phrases

    def get_postage_and_parser(self):
        """Tag and parse every segmented sentence and write unique phrases.

        Collects the candidate phrases of all sentences in
        ``sentence_cutted``, releases the tagger and parser models, joins each
        phrase into one string, strips non-word characters, drops empty and
        repeated strings (first occurrence wins) and writes one
        tab-separated row ``1<TAB>phrase`` per remaining string to
        ``output_file``.
        """
        function_phrase = []  # one list of candidate phrases per description
        for sentences in self.sentence_cutted:
            app_phrase = []
            for sentence in sentences:
                app_phrase.extend(self._phrases_for_sentence(sentence))
            function_phrase.append(app_phrase)
        self.postagger.release()
        self.parser.release()

        # Remove duplicates while keeping first-seen order.
        seen = set()
        final_function_phrase = []
        for app_phrase in function_phrase:
            for sent_phrase in app_phrase:
                text = self._NON_WORD.sub('', ''.join(sent_phrase))
                if text and text not in seen:
                    seen.add(text)
                    final_function_phrase.append(text)

        # Write the result as a tsv file (no explicit encoding is passed, so
        # the platform default of open() applies, as before).
        csv.register_dialect('tsv_dialect', delimiter='\t')
        with open(self.output_file, 'w', newline='') as train_set_file:
            writer = csv.writer(train_set_file, dialect='tsv_dialect')
            for final_app_phrase in final_function_phrase:
                writer.writerow([1, final_app_phrase])
if __name__ == '__main__':
    # Extract the set of candidate phrases from the given app description file.
    # Usage: candidate_phrase.py <input_csv> <output_tsv> [ltp_model_dir]
    if len(sys.argv) < 3:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit('usage: {} <input_csv> <output_tsv> [ltp_model_dir]'.format(sys.argv[0]))
    input_file = sys.argv[1]
    output_file = sys.argv[2]
    # Optional third argument is passed on as the constructor's pyltp_path;
    # when absent the constructor falls back to its own default directory.
    pyltp_path = sys.argv[3] if len(sys.argv) > 3 else None
    extractor = CandidatePhraseExtractor(input_file, output_file, pyltp_path)
    extractor.read_data_from_file()
    extractor.get_seg_sentence_from_pyltp()
    extractor.get_postage_and_parser()