-
Notifications
You must be signed in to change notification settings - Fork 83
/
Copy pathnew_words_mining.py
63 lines (49 loc) · 1.58 KB
/
new_words_mining.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# -*- coding: utf-8 -*-
# @Date : 2020/12/8
# @Author : mingming.xu
# @Email : xv44586@gmail.com
# @File : new_words_mining.py
"""
Mine new words and phrases based on PMI (pointwise mutual information).
ref: [最小熵原理(二):“当机立断”之词库构建](https://kexue.fm/archives/5476)
"""
import os
from nlp_zero import *
import jieba
# Load jieba's dictionary now instead of lazily on first use.
jieba.initialize()
# Root directory of the dataset. load_data() reads
# '<split>/<split>.query.tsv' and '<split>/<split>.reply.tsv' beneath it.
path = '/home/mingming.xu/datasets/NLP/ccf_qa_match/'
def load_data(train_test='train', data_dir=None):
    """Load one dataset split as a list of concatenated query/reply texts.

    Reads ``<data_dir>/<train_test>/<train_test>.query.tsv`` and
    ``<data_dir>/<train_test>/<train_test>.reply.tsv``. Both files are
    tab-separated and decoded as UTF-8.

    Query rows are ``query_id<TAB>query``. Reply rows are
    ``query_id<TAB>reply_id<TAB>reply`` with an optional fourth ``label``
    column; the reply id and label are parsed but not used in the result.
    Blank lines are skipped.

    Args:
        train_test: Name of the split; used both as the sub-directory name
            and as the file-name prefix.
        data_dir: Dataset root directory. Defaults to the module-level
            ``path`` when ``None``.

    Returns:
        A list with one string per query, in query-file order: the query
        text immediately followed by each of its reply texts (in reply-file
        order), joined without any separator.

    Raises:
        KeyError: A reply row refers to a query id absent from the query
            file.
        IndexError: A query row has fewer than two columns.
        ValueError: A reply row has neither three nor four columns.
    """
    if data_dir is None:
        data_dir = path
    split_dir = os.path.join(data_dir, train_test)

    # query id -> [query text, reply text, reply text, ...]
    texts = {}

    query_file = os.path.join(split_dir, train_test + '.query.tsv')
    # The corpus is Chinese text; decode explicitly instead of relying on
    # the platform's locale encoding.
    with open(query_file, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            fields = line.split('\t')
            texts[fields[0]] = [fields[1]]

    reply_file = os.path.join(split_dir, train_test + '.reply.tsv')
    with open(reply_file, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            fields = line.split('\t')
            # Labelled rows carry a fourth column; unlabelled rows have three.
            if len(fields) == 4:
                q_id, _r_id, reply, _label = fields
            else:
                q_id, _r_id, reply = fields
            texts[q_id].append(reply)

    return [''.join(parts) for parts in texts.values()]
# Load the concatenated query/reply texts for both dataset splits.
train_data, test_data = load_data('train'), load_data('test')
class G(object):
    """Iterable over the train texts followed by the test texts.

    Every ``iter()`` call yields a fresh pass over the module-level
    ``train_data`` and ``test_data`` lists, concatenated in that order.
    """

    def __iter__(self):
        corpus = train_data + test_data
        return iter(corpus)
# Run the word finder over the combined corpus; the candidates it produces
# are read from ``finder.words`` below.
finder = Word_Finder(min_proba=1e-5)
finder.train(G())
finder.find(G())

# Keep candidates that are 3 or 4 characters long and that jieba, with HMM
# disabled, cuts into more than one token (i.e. not a single jieba word).
# NOTE(review): the original comment gave the length range as "2~5", but the
# filter has always been 2 < len(w) < 5 -- confirm which one is intended.
new_words = [
    w for w in finder.words
    if 2 < len(w) < 5 and len(jieba.lcut(w, HMM=False)) > 1
]

# One word per line. The encoding is explicit so the Chinese output does not
# depend on the platform's locale encoding.
with open('new_dict.txt', 'w', encoding='utf-8') as out:
    out.write('\n'.join(new_words))