# new-trained.py
# import required packages
import numpy as np  # used throughout for the embedding arrays
import pandas as pd
import re
import html
import urllib.request
import urllib.parse
import string
def read_glove_vecs(file):
    # GloVe files are UTF-8 encoded, so the encoding is set explicitly
    with open(file, 'r', encoding='utf-8') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            word = line[0]
            words.add(word)
            word_to_vec_map[word] = np.array(line[1:], dtype=np.float64)
    return words, word_to_vec_map
words, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt') # replace file path with your location for 50-d embeddings
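# optional sanity check (not in the original pipeline): the 6B.50d file should
# yield 400000 words with 50-dimensional vectors
print(len(words), len(word_to_vec_map['the']))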
# size of embeddings.
vector_dim = 50
#read 'data_dump' file
df1 = pd.read_csv('data_dump.txt', sep='\t', header=None)
# code for translating text to English, inspired by code shared on GitHub
agent = {'User-Agent': ("Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; "
                        ".NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)")}
def unescape(text):
    # html.parser.HTMLParser().unescape() was removed in Python 3.9;
    # html.unescape() is the supported replacement
    return html.unescape(text)
def translate(sent_to_translate, to_language="auto", from_language="auto"):
    sent_to_translate = urllib.parse.quote(sent_to_translate)
    link = "https://translate.google.com/m?hl={}&sl={}&q={}".format(to_language, from_language, sent_to_translate)
    request = urllib.request.Request(link, headers=agent)
    data = urllib.request.urlopen(request).read().decode("utf-8")
    translation = re.findall(r'class="t0">(.*?)<', data)
    if len(translation) == 0:
        result = ''
    else:
        result = unescape(translation[0])
    return result
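# example usage (left commented out to avoid a network call; the scraped
# 'class="t0"' markup is brittle and may change on Google's side):
# translate('hola, como estas', to_language='en')  # should return something like 'hello, how are you'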
# pass every entry of the data set through the translator, collect the results
# in the list 'lst', and print a progress message after every 500 samples
j = 0
lst = []
for i in df1[0]:
    j += 1
    lst.append(translate(i, to_language='en'))
    if j % 500 == 0:
        print('{} entries completed'.format(j))
# make a new column in the dataframe that shows the translated text
df1['new'] = lst
# separate entries like '2am' into '2 am', i.e. insert a space between a number
# and the letters that follow it, after lower-casing the text
df1.new = df1.new.map(lambda x: re.sub(r'(\d)([a-zA-Z]+)', r'\1 \2', x.lower()))
# tokenise the text into a list, with punctuation split off as separate tokens
# (a plain text.split() would leave punctuation immediately following a word
# attached to that word)
df1.new = df1.new.map(lambda x: re.findall(r'[\w]+|[,;.?!#&]', x))
# the common stop words of the English language. nltk.stopwords wasn't available
# here, so the list was taken from a quick web search, as the stop words were
# easily available, and then split into a list
lst3 = 'i,me,my,myself,we,our,ours,ourselves,you,your,yours,yourself,yourselves,he,him,his,himself,she,her,hers,herself,it,its,itself,they,them,their,theirs,themselves,what,which,who,whom,this,that,these,those,am,is,are,was,were,be,been,being,have,has,had,having,do,does,did,doing,a,an,the,and,but,if,or,because,as,until,while,of,at,by,for,with,about,against,between,into,through,during,before,after,above,below,to,from,up,down,in,out,on,off,over,under,again,further,then,once,here,there,when,where,why,how,all,any,both,each,few,more,most,other,some,such,no,nor,not,only,own,same,so,than,too,very,s,t,can,will,just,don,should,now'
lst3 = lst3.split(',')
# for use later on
def cosine_similarity(x, y):
    # dot product between x and y
    dot = np.dot(x, y)
    # L2 norms of x and y
    norm_x = np.sqrt(np.sum(x**2))
    norm_y = np.sqrt(np.sum(y**2))
    # cosine similarity = (x . y) / (||x|| * ||y||)
    return dot / (norm_x * norm_y)
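# quick sanity check on the pre-trained vectors (not in the original pipeline):
# related words should score noticeably higher than unrelated ones; the exact
# values depend on the embeddings
print(cosine_similarity(word_to_vec_map['good'], word_to_vec_map['great']))
print(cosine_similarity(word_to_vec_map['good'], word_to_vec_map['car']))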
# train your own word embeddings on top of the pre-trained GloVe vectors
# training word vectors on our own corpus helps capture further similarities between the words
# TO PUT THE DATA INTO THE FORMAT REQUIRED BY KERAS:
# concatenate the entire data set into one huge corpus, with each example separated
# from the next by 4 spacer tokens (4 because, later on, skipgrams are generated
# with a window size of 4 when identifying the context words of a target word).
# Without the spacers, if one sentence ended in a positive way, e.g. with 'good',
# and the next sentence started with 'fuck', then 'good' would become a context
# word for 'fuck', pushing their embeddings towards each other and inflating
# their cosine similarity
lst4 = []
for i in df1.index:
    lst4 += df1.new.loc[i] + [' '] * 4
# similar pre-processing: remove punctuation and stop words
# (filtering into a new list avoids calling .remove() while iterating, which
# skips elements and only removes the first occurrence of each token)
lst4 = [i for i in lst4 if i not in string.punctuation]
lst4 = [i for i in lst4 if i not in lst3]
# make it into a Series to get the unique entries of the huge corpus
# (set() could have been used, but it produces an unordered collection of words,
# which would change the index assigned to each word every time the code is run)
series = pd.Series(lst4)
dic = {}
# build a dictionary mapping each word to an index; this way the index of a
# particular word stays the same on every run
for index, word in enumerate(series.unique()):
    dic[word] = index
# transform the huge corpus into the corresponding indexes
for i, j in enumerate(lst4):
    lst4[i] = dic[j]
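# optional sanity check (not in the original pipeline): size of the indexed
# corpus and of the vocabulary
print('corpus tokens: {}, unique words: {}'.format(len(lst4), len(dic)))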
# TRAIN NEW WORD EMBEDDINGS ON THE CORPUS
# import the necessary keras modules (Input and Embedding are also needed for the model below)
from keras.preprocessing import sequence
from keras.layers import Input, Embedding, Dot, Reshape, Dense
from keras.models import Model
# size of the vocabulary, i.e. the number of unique words in the corpus
vocab_size = len(dic) + 1
# sampling table used when generating skipgrams, so that the most common words
# are given a lower probability of being drawn as negative samples
sampling_table = sequence.make_sampling_table(vocab_size)
# make the skipgrams from the corpus, with a window size of 4 for the context
# words, using the sampling table generated above. Returns tuples of
# (target word, context word) and an associated label for each tuple (1 if the
# context word really appears in the target word's window in the data set,
# 0 if it is a negative sample)
window_size = 4  # matches the 4-token spacer inserted between examples above
tuples, labels = sequence.skipgrams(lst4, vocab_size, window_size=window_size, sampling_table=sampling_table)
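# optional sanity check (not in the original pipeline): inspect a few of the
# generated (target, context) pairs. 'inv_dic' is an illustrative helper that
# maps indexes back to words; negative samples may fall outside 'dic', hence .get()
inv_dic = {index: word for word, index in dic.items()}
for (t, c), label in list(zip(tuples, labels))[:5]:
    print(inv_dic.get(t, '<unk>'), inv_dic.get(c, '<unk>'), label)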
# extract the target and context words and convert them into arrays (bear in mind that target and context words are
# now represented by their corresponding indexes from 'dic' dictionary)
target_word, context_word = zip(*tuples)
word_target = np.array(target_word, dtype="int32")
word_context = np.array(context_word, dtype="int32")
# make a new embedding matrix; the pre-trained GloVe vectors will be loaded into it
# initialise with zeros
embedding_matrix = np.zeros((vocab_size, vector_dim))
# for each word, fill the row of the embedding matrix at that word's index with
# its 50-dimensional GloVe embedding
for word, index in dic.items():
    try:
        embedding_matrix[index, :] = word_to_vec_map[word]
    except KeyError:
        # word not present in the GloVe vectors: its row keeps the zeros it was
        # initialised with in the first place
        continue
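# optional check (not in the original pipeline): report GloVe coverage, since
# words missing from GloVe keep their all-zero initialisation and only acquire
# meaning during training
covered = sum(1 for w in dic if w in word_to_vec_map)
print('{} of {} vocabulary words found in GloVe'.format(covered, len(dic)))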
# START BUILDING THE KERAS MODEL FOR TRAINING
input_target = Input((1,))
input_context = Input((1,))
# make a Keras embedding layer of shape (vocab_size, vector_dim) and set 'trainable' argument to 'True'
embedding = Embedding(input_dim = vocab_size, output_dim = vector_dim, input_length = 1, name='embedding', trainable = True)
# load pre-trained weights(embeddings) from 'embedding_matrix' into the Keras embedding layer
embedding.build((None,))
embedding.set_weights([embedding_matrix])
# run the context and target words through the embedding layer
context = embedding(input_context)
context = Reshape((vector_dim, 1))(context)
target = embedding(input_target)
target = Reshape((vector_dim, 1))(target)
# compute the dot product of the context and target words, to find the similarity (dot product is usually a measure of similarity)
dot = Dot(axes = 1)([context, target])
dot = Reshape((1,))(dot)
# pass it through a 'sigmoid' activation neuron; the output is then compared
# with the corresponding value in 'labels' generated by skipgrams
out = Dense(1, activation='sigmoid')(dot)
# create the model instance ('inputs'/'outputs' are the keyword arguments
# expected by current Keras versions)
model = Model(inputs=[input_context, input_target], outputs=out)
model.compile(loss='binary_crossentropy', optimizer='adam')
# fit the model with the default batch_size of 32; the inputs are ordered
# [context, target] to match the Model definition above.
# running for 10 epochs seems to generate good enough results, although running
# for more iterations may improve performance further
model.fit(x=[word_context, word_target], y=labels, epochs=10)
# get the new word embeddings: save the array of shape (vocab_size, vector_dim) to 'word_vecs'.
# the embedding layer sits at index 2 of model.layers, after the two Input layers
word_vecs = model.layers[2].get_weights()[0]
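# optional: persist the freshly-trained embeddings for a later run
# ('my_weights.npy' is an illustrative filename; the thresholds below were tuned
# against the embeddings shipped in 'weights.npy', which is loaded next)
# np.save('my_weights.npy', word_vecs)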
# NOTE: since 'make_sampling_table' and 'skipgrams' pick context and target words
# randomly, each training run produces slightly different word embeddings, which
# won't line up with the hard-coded comparison thresholds used below (such as
# 0.45 for 'fuck'). To reproduce the results those thresholds were tuned for,
# load the embeddings of shape (vocab_size, vector_dim) from the 'weights.npy' file.
# please make sure that the 'weights.npy' file is in the current directory
word_vecs = np.load('weights.npy')
# loop through the data set and compute the cosine similarities of each sentence
# vector with the embeddings of reference words such as 'appropriate', 'fucking', etc.
# The threshold values were hard-coded after a lot of experimentation, trying to
# strike a balance between recall and precision, although it is impossible to get
# an exact figure for either without a supervised approach.
df1['new1_trained'] = 0
lst2 = []
for i in df1.index:
    lst = []
    words = [j for j in df1.new.loc[i] if j.isalpha()]
    words = [j for j in words if j not in string.punctuation]
    words = [j for j in words if j not in lst3 + ['hello', 'hi', 'hey']]
    for word in words:
        try:
            # new embeddings are accessed through the word's index in 'dic'
            lst.append(word_vecs[dic[word]])
        except KeyError:
            # skip words that never made it into the vocabulary
            continue
    arr = np.array(lst)
    arrsum = arr.sum(axis=0)
    # if no words survived the filtering, sum() returns a scalar rather than an
    # array; fall back to a zero vector, otherwise L2-normalise the sentence vector
    if not isinstance(arrsum, np.ndarray):
        arrsum = np.zeros(vector_dim)
    else:
        arrsum = arrsum / np.sqrt((arrsum**2).sum())
    lst2.append(arrsum)
df1['new1_trained'] = lst2
df1['inappropriate'] = 0
# flag a message when its sentence vector is far from 'appropriate' or close to
# any of the reference words. df1.loc[i, 'inappropriate'] writes directly into
# the frame (chained indexing such as df1['inappropriate'].loc[i] may not write back)
for i in df1.index:
    if cosine_similarity(word_vecs[dic['appropriate']], df1.new1_trained.loc[i]) < -0.1:
        df1.loc[i, 'inappropriate'] = 1
    if cosine_similarity(word_vecs[dic['fuck']], df1.new1_trained.loc[i]) > 0.45:
        df1.loc[i, 'inappropriate'] = 1
    if cosine_similarity(word_vecs[dic['suck']], df1.new1_trained.loc[i]) > 0.45:
        df1.loc[i, 'inappropriate'] = 1
    if cosine_similarity(word_vecs[dic['sex']], df1.new1_trained.loc[i]) > 0.4:
        df1.loc[i, 'inappropriate'] = 1
    if cosine_similarity(word_vecs[dic['horny']], df1.new1_trained.loc[i]) > 0.25:
        df1.loc[i, 'inappropriate'] = 1
    if cosine_similarity(word_vecs[dic['anal']], df1.new1_trained.loc[i]) > 0.35:
        df1.loc[i, 'inappropriate'] = 1
# rename columns, drop the intermediate ones, and write the result to a csv file
df1.rename(columns={0: 'Messages', 'inappropriate': 'Inappropriate'}).drop(['new', 'new1_trained'], axis=1).to_csv('New-Trained.csv', index=False)