# -*- coding: utf-8 -*-
import fileinput  # used by the commented-out word2vec filtering snippet below
import jieba
import numpy as np
from keras.preprocessing.text import Tokenizer
def load_data(name):
    """Load the train/dev/test splits of dataset `name`, segment the text with
    jieba, and build a 300-d word2vec embedding matrix for the vocabulary."""
    def get_w2v():
        # Each cached line is: word v1 v2 ... v300
        for line in open('data/cache/word2vec', encoding="utf-8").read().strip().split('\n'):
            line = line.strip().split()
            if not line: continue
            yield line[0], np.array(list(map(float, line[1:])))
    # Fit the tokenizer on the whole corpus (labels stripped, words space-joined);
    # oov_token=1 maps out-of-vocabulary words to index 1 (kept from the original).
    tokenizer = Tokenizer(filters='', lower=True, split=' ', oov_token=1)
    texts = [' '.join(jieba.cut(line.split('\t', 1)[1].strip()))
             for line in open('data/{}/{}.txt'.format(name, name),
                              encoding="utf-8").read().strip().split('\n')]
    tokenizer.fit_on_texts(texts)
    # One-off preprocessing that produced data/cache/word2vec: filter the
    # pretrained sgns.sogou.word vectors down to this vocabulary.
    # with open('word2vec', 'w') as out:
    #     for line in fileinput.input('sgns.sogou.word'):
    #         word = line.strip().split()[0]
    #         if word in tokenizer.word_index:
    #             out.write(line)
    #     fileinput.close()
    x_train, y_train, text_train = [], [], []
    for line in open('data/{}/train.txt'.format(name), encoding="utf-8").read().strip().split('\n'):
        label, text = line.split('\t', 1)
        text_train.append(text.strip())
        x_train.append(' '.join(jieba.cut(text.strip())))
        y_train.append(int(label))
    x_train = tokenizer.texts_to_sequences(x_train)
    x_dev, y_dev, text_dev = [], [], []
    for line in open('data/{}/dev.txt'.format(name), encoding="utf-8").read().strip().split('\n'):
        label, text = line.split('\t', 1)
        text_dev.append(text.strip())
        x_dev.append(' '.join(jieba.cut(text.strip())))
        y_dev.append(int(label))
    x_dev = tokenizer.texts_to_sequences(x_dev)
    x_test, y_test, text_test = [], [], []
    for line in open('data/{}/test.txt'.format(name), encoding="utf-8").read().strip().split('\n'):
        label, text = line.split('\t', 1)
        text_test.append(text.strip())
        x_test.append(' '.join(jieba.cut(text.strip())))
        y_test.append(int(label))
    x_test = tokenizer.texts_to_sequences(x_test)
    # Embedding matrix: index 0 is padding; words without a pretrained vector stay zero.
    v_size = len(tokenizer.word_index) + 1
    embs, w2v = np.zeros((v_size, 300)), dict(get_w2v())
    for word, index in tokenizer.word_index.items():
        if word in w2v: embs[index] = w2v[word]
    return (x_train, y_train, text_train), \
           (x_dev, y_dev, text_dev), \
           (x_test, y_test, text_test), \
           v_size, embs
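
# A minimal usage sketch (an addition, not in the original file): it assumes the
# data/<name>/ layout read above plus data/cache/word2vec, and the maxlen of 50
# is illustrative only.
def _demo_load_data(name='hotel'):
    from keras.preprocessing.sequence import pad_sequences
    (x_train, y_train, _), _, _, v_size, embs = load_data(name)
    # x_train is a list of int id sequences; pad to a fixed length before batching.
    x_train = pad_sequences(x_train, maxlen=50)
    # embs has shape (v_size, 300) and can seed an embedding layer, e.g.
    # keras.layers.Embedding(v_size, 300, weights=[embs]).
    print(x_train.shape, len(y_train), v_size, embs.shape)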
def load_data_aug(name, n_iter=20, p_mask=0.1, p_ng=0.25, ngram_range=(3, 6)):
    """Same as load_data, but augments each training sample with up to `n_iter`
    variants: every word is masked with probability `p_mask`, and with
    probability `p_ng` the sample is cut down to a random n-gram whose length
    is drawn from `ngram_range` (inclusive)."""
    def get_w2v():
        for line in open('data/cache/word2vec', encoding="utf-8").read().strip().split('\n'):
            line = line.strip().split()
            if not line: continue
            yield line[0], np.array(list(map(float, line[1:])))
    tokenizer = Tokenizer(filters='', lower=True, split=' ', oov_token=1)
    texts = [' '.join(jieba.cut(line.split('\t', 1)[1].strip()))
             for line in open('data/{}/{}.txt'.format(name, name),
                              encoding="utf-8").read().strip().split('\n')]
    tokenizer.fit_on_texts(texts)
    x_train, y_train, text_train = [], [], []
    for line in open('data/{}/train.txt'.format(name), encoding="utf-8").read().strip().split('\n'):
        label, text = line.split('\t', 1)
        text = text.strip()
        # Preserve one original sample first.
        text_train.append(text)
        x_train.append(' '.join(jieba.cut(text)))
        y_train.append(int(label))
        # Data augmentation: generate up to n_iter distinct variants per sample.
        used_texts = {text}
        for i in range(n_iter):
            words = jieba.lcut(text)
            # Word masking: replace each word with "[MASK]" with probability p_mask.
            words = ["[MASK]" if np.random.rand() < p_mask else x for x in words]
            # N-gram sampling: with probability p_ng, keep only a random n-gram.
            if np.random.rand() < p_ng:
                n_gram_len = np.random.randint(ngram_range[0], ngram_range[1] + 1)
                n_gram_len = min(n_gram_len, len(words))
                n_gram_start = np.random.randint(0, len(words) - n_gram_len + 1)
                words = words[n_gram_start:n_gram_start + n_gram_len]
            new_text = "".join(words)
            if new_text not in used_texts:
                text_train.append(new_text)
                x_train.append(' '.join(words))
                y_train.append(int(label))
                used_texts.add(new_text)
    x_train = tokenizer.texts_to_sequences(x_train)
    x_dev, y_dev, text_dev = [], [], []
    for line in open('data/{}/dev.txt'.format(name), encoding="utf-8").read().strip().split('\n'):
        label, text = line.split('\t', 1)
        text_dev.append(text.strip())
        x_dev.append(' '.join(jieba.cut(text.strip())))
        y_dev.append(int(label))
    x_dev = tokenizer.texts_to_sequences(x_dev)
    x_test, y_test, text_test = [], [], []
    for line in open('data/{}/test.txt'.format(name), encoding="utf-8").read().strip().split('\n'):
        label, text = line.split('\t', 1)
        text_test.append(text.strip())
        x_test.append(' '.join(jieba.cut(text.strip())))
        y_test.append(int(label))
    x_test = tokenizer.texts_to_sequences(x_test)
    v_size = len(tokenizer.word_index) + 1
    embs, w2v = np.zeros((v_size, 300)), dict(get_w2v())
    for word, index in tokenizer.word_index.items():
        if word in w2v: embs[index] = w2v[word]
    return (x_train, y_train, text_train), \
           (x_dev, y_dev, text_dev), \
           (x_test, y_test, text_test), \
           v_size, embs
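
# A hedged, self-contained illustration (not in the original) of the two
# augmentation moves used above: per-word masking with probability p_mask,
# then, with probability p_ng, keeping only a random n-gram. The sample
# sentence is made up for demonstration.
def _demo_augment_once(text='这家酒店的服务很好', p_mask=0.1, p_ng=0.25, ngram_range=(3, 6)):
    words = jieba.lcut(text)
    words = ['[MASK]' if np.random.rand() < p_mask else w for w in words]
    if np.random.rand() < p_ng:
        n = min(np.random.randint(ngram_range[0], ngram_range[1] + 1), len(words))
        start = np.random.randint(0, len(words) - n + 1)
        words = words[start:start + n]
    return ''.join(words)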
if __name__ == '__main__':
    # load_data(name='hotel')
    load_data_aug(name='hotel')
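    # Hedged extra smoke check (an addition): print a few augmented variants
    # from the standalone sketch above to eyeball the masking and n-gram cuts.
    for _ in range(3):
        print(_demo_augment_once())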