import gensim, itertools
import numpy as np
from params import FLAGS
import zipfile
import xml.etree.ElementTree as xmlParser
from nltk.tokenize import TweetTokenizer
import os
import sys
# takes a char-embedding file path and returns the character list and their vectors, respectively.
# the order of the characters is preserved; the character value can be recovered with chr() (unichr() in Python 2).
def readCharEmbeddings(path, embedding_size):
    in_file = gensim.models.word2vec.LineSentence(path)
    lines = lambda: itertools.islice(in_file, None)
    model_tuple = lambda: ((line[0], [float(value) for value in line[1:]]) for line in lines())
    model_dict = dict(model_tuple())
    # keep only rows whose vector length matches embedding_size
    char_list = [int(char_number) for char_number in model_dict.keys() if
                 len(model_dict[char_number]) == embedding_size]
    vectors = [embed for embed in model_dict.values() if len(embed) == embedding_size]
    # special tokens: 'U' = unknown character, 'P' = padding
    char_list.append(ord('U'))
    vectors.append(np.random.randn(embedding_size))
    char_list.append(ord('P'))
    vectors.append(np.zeros(embedding_size))
    return char_list, np.array(vectors)
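# Usage sketch (illustrative; the file name and the dimensionality 25 are placeholders,
# not values taken from this repository). The embedding file is assumed to hold one
# character per line: its code point followed by the vector components.
#
#   char_list, char_vectors = readCharEmbeddings("char_embeddings.txt", 25)
#   char_lookup = {code: idx for idx, code in enumerate(char_list)}  # code point -> row index
#   assert char_vectors.shape == (len(char_list), 25)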
## READS GLOVE EMBEDDINGS
def readGloveEmbeddings(path, embedding_size):
    DOC_LIMIT = None
    in_file = gensim.models.word2vec.LineSentence(path)
    lines = lambda: itertools.islice(in_file, DOC_LIMIT)
    model_tuple = lambda: ((line[0], [float(value) for value in line[1:]]) for line in lines())
    model_dict = dict(model_tuple())
    temp_vocab = list(model_dict.keys())
    temp_vectors = list(model_dict.values())
    vocab = list()
    vectors = list()
    # remove erratic embeddings: keep only rows whose length matches the embedding_size parameter
    count = 0
    for line in temp_vectors:
        if len(line) == embedding_size:
            vocab.append(temp_vocab[count])
            vectors.append(temp_vectors[count])
        count += 1
    del temp_vectors, temp_vocab, model_dict
    # add special tokens
    vocab.append("UNK")
    vectors.append(np.random.randn(embedding_size))
    vocab.append("PAD")
    vectors.append(np.zeros(embedding_size))
    embeddings = np.array(vectors)
    return vocab, embeddings
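# Usage sketch (illustrative; the GloVe file name and size 200 are placeholders).
# word2id() below expects a word -> id mapping, so the returned vocabulary list is
# typically turned into a dict whose indices line up with the embedding matrix rows.
#
#   vocab, embeddings = readGloveEmbeddings("glove.twitter.200d.txt", 200)
#   vocab_map = {word: idx for idx, word in enumerate(vocab)}
#   assert embeddings.shape[0] == len(vocab)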
# reads the PAN-2017 author-profiling training dataset
# one-hot vectors: female = [0, 1]
#                  male   = [1, 0]
# input:  string = path to the extracted training-data directory
# output: list ("training_set")  = author - tweet pairs
#         dict ("target_values") = author (key) - ground truth (value) pairs
#         list ("seq_lengths")   = length of each tweet in "training_set"
def readData(path, mode):
    path = os.path.join(os.path.join(path, FLAGS.lang), "text")
    tokenizer = TweetTokenizer()
    training_set = []
    target_values = {}
    seq_lengths = []
    # for each author
    for name in os.listdir(path):
        if mode != "Test":
            # ground-truth values are here
            if name.endswith(".txt"):
                file_name = os.path.join(path, name)
                if sys.version_info[0] < 3:
                    text = open(file_name, 'r')
                    # each line = one author
                    for line in text:
                        words = line.strip().split(b':::')
                        if words[1].decode() == "female":
                            target_values[words[0].decode()] = [0, 1]
                        elif words[1].decode() == "male":
                            target_values[words[0].decode()] = [1, 0]
                else:
                    text = open(file_name, 'r', encoding="utf8")
                    # each line = one author
                    for line in text:
                        words = line.strip().split(':::')
                        if words[1] == "female":
                            target_values[words[0]] = [0, 1]
                        elif words[1] == "male":
                            target_values[words[0]] = [1, 0]
        # tweets are here
        if name.endswith(".xml"):
            # get author name from file name
            base = os.path.basename(name)
            author_id = os.path.splitext(base)[0]
            # parse tweets
            xml_file_name = os.path.join(path, name)
            if sys.version_info[0] < 3:
                xmlFile = open(xml_file_name, "r")
            else:
                xmlFile = open(xml_file_name, "r", encoding="utf8")
            rootTag = xmlParser.parse(xmlFile).getroot()
            # for each tweet
            for documents in rootTag:
                for document in documents.findall("document"):
                    words = tokenizer.tokenize(document.text)
                    training_set.append([author_id, words])  # author-tweet pairs
                    seq_lengths.append(len(words))  # length of the tweet, fed to the rnn as the timestep size
    if mode != "Test":
        return training_set, target_values, seq_lengths
    else:
        return training_set, None, seq_lengths
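# Usage sketch (illustrative; "pan17-data" is a placeholder for the extracted
# PAN-2017 directory, which is expected to contain <lang>/text/ with one
# <author_id>.xml per author plus a ':::'-separated truth file).
#
#   training_set, target_values, seq_lengths = readData("pan17-data", "Train")
#   author_id, tokens = training_set[0]
#   gender_one_hot = target_values[author_id]   # [1, 0] = male, [0, 1] = female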
# takes a list of tweets and a character lookup (code point -> embedding index, where
# 'U' stands for unknown characters and 'P' for padding)
# returns the id of each character in each tweet
def char2id(tweets, char_list):
    batch_tweet_ids = []
    for tweet in tweets:
        tweet_ids = []
        for word in tweet:
            for char in word:
                if char != 'P':
                    char = char.lower()
                try:
                    tweet_ids.append(char_list[ord(char)])
                except:
                    tweet_ids.append(char_list[ord('U')])
        batch_tweet_ids.append(tweet_ids)
    return batch_tweet_ids
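# Usage sketch (illustrative): char_lookup is assumed to be the code point -> index
# mapping built from readCharEmbeddings(), as in the sketch above.
#
#   batch = [["hello", "world"], ["short"]]
#   char_ids = char2id(batch, char_lookup)   # one list of character ids per tweet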
#### PREPARES TEST DATA (word level) ####
# NOTE: the character-level prepTestData() defined at the bottom of this file shares
# this name and therefore shadows this definition when the module is loaded.
def prepTestData(tweets, user, target):
    # prepare output
    test_output = user2target(user, target)
    # prepare input by adding padding
    tweet_lengths = [len(tweet) for tweet in tweets]
    max_tweet_length = max(tweet_lengths)
    test_input = []
    for i in range(len(tweets)):
        tweet = tweets[i]
        padded_tweet = []
        for j in range(max_tweet_length):
            if len(tweet) > j:
                padded_tweet.append(tweet[j])
            else:
                padded_tweet.append("PAD")
        test_input.append(padded_tweet)
    return test_input, test_output
# maps each user to its ground-truth one-hot vector
def user2target(users, targets):
    target_values = []
    for user in users:
        target_values.append(targets[user])
    return target_values
# takes all tweets, users and targets;
# builds a character-level batch for the given iteration number, batch size and
# maximum tweet size (tweets shorter than max_tweet_size are padded with 'P')
def prepCharBatchData(mode, tweets, users, targets, iter_no, batch_size, max_tweet_size):
    start = iter_no * batch_size
    end = iter_no * batch_size + batch_size
    if end > len(tweets):
        end = len(tweets)
    batch_tweets = tweets[start:end]
    batch_users = users[start:end]
    # prepare output
    if mode != "Test":
        batch_output = user2target(batch_users, targets)
    else:
        batch_output = None
    batch_input = list()
    for tweet in batch_tweets:
        tweet_char_list = list()
        for word in tweet:
            tweet_char_list.extend([char for char in word.lower()])
        size = len(tweet_char_list)
        if size < max_tweet_size:
            for i in range(max_tweet_size - size):
                tweet_char_list.append('P')
        batch_input.append(tweet_char_list)
    return batch_input, batch_output
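# Usage sketch (illustrative; assumes training_set/target_values from readData() and
# the char_lookup dict from the readCharEmbeddings() sketch; the batch size 32 and
# maximum tweet size 600 are placeholders, not values taken from this repository).
#
#   users = [pair[0] for pair in training_set]
#   tweets = [pair[1] for pair in training_set]
#   batch_chars, batch_targets = prepCharBatchData("Train", tweets, users, target_values, 0, 32, 600)
#   batch_char_ids = char2id(batch_chars, char_lookup)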
# changes tokenized words to their corresponding ids in the vocabulary
# (vocab is expected to be a word -> id mapping; unknown words fall back to "UNK")
def word2id(tweets, vocab):
    batch_tweet_ids = []
    for tweet in tweets:
        tweet_ids = []
        for word in tweet:
            if word != "PAD":
                word = word.lower()
            try:
                tweet_ids.append(vocab[word])
            except:
                tweet_ids.append(vocab["UNK"])
        batch_tweet_ids.append(tweet_ids)
    return batch_tweet_ids
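# Usage sketch (illustrative): vocab_map is assumed to be the word -> id dict built
# from readGloveEmbeddings(), as in the sketch above; "PAD" and "UNK" are present in it.
#
#   padded_batch = [["good", "morning", "PAD"], ["some", "unseen", "token"]]
#   word_ids = word2id(padded_batch, vocab_map)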
# prepares word-level batch data - also pads tweets to the batch's maximum length
# input:   tweets(list)  - list of tweets belonging to the authors in:
#          users(list)   - owners of the above tweets
#          targets(dict) - ground-truth gender vector of each owner
#          seq_len(list) - sequence length of each tweet
#          iter_no(int)  - current iteration number
# returns: batch_input       - padded word tokens (converted to ids by word2id before tf.nn.embedding_lookup)
#          batch_output      - target values to be fed to the rnn
#          batch_sequencelen - number of words in each tweet (gives the number of time unrolls)
def prepWordBatchData(mode, tweets, users, targets, seq_len, iter_no):
    start = iter_no * FLAGS.batch_size
    end = iter_no * FLAGS.batch_size + FLAGS.batch_size
    if end > len(tweets):
        end = len(tweets)
    batch_tweets = tweets[start:end]
    batch_users = users[start:end]
    batch_sequencelen = seq_len[start:end]
    # prepare output
    if mode != "Test":
        batch_output_temp = user2target(batch_users, targets)
        batch_output = batch_output_temp[0]
    # prepare input by adding padding
    tweet_lengths = [len(tweet) for tweet in batch_tweets]
    max_tweet_length = max(tweet_lengths)
    batch_input = []
    for i in range(len(batch_tweets)):  # iterate over the actual batch, which can be shorter than FLAGS.batch_size
        tweet = batch_tweets[i]
        padded_tweet = []
        for j in range(max_tweet_length):
            if len(tweet) > j:
                padded_tweet.append(tweet[j])
            else:
                padded_tweet.append("PAD")
        batch_input.append(padded_tweet)
    if mode != "Test":
        return batch_input, np.asarray(batch_output).reshape(1, 2), batch_sequencelen
    else:
        return batch_input, None, batch_sequencelen
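# Usage sketch (illustrative; assumes the outputs of readData() and the word -> id
# dict from the readGloveEmbeddings() sketch; iter_no 0 selects the first batch).
#
#   batch_words, batch_target, batch_lens = prepWordBatchData(
#       "Train", tweets, users, target_values, seq_lengths, 0)
#   batch_word_ids = word2id(batch_words, vocab_map)   # feed to tf.nn.embedding_lookup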
#############################################################################################################
#### PREPARES TEST DATA (character level) ####
# pads each tweet with 'P' up to max_tweet_size; shadows the word-level prepTestData defined above
def prepTestData(tweets, user, target, max_tweet_size):
    # prepare output
    test_output = user2target(user, target)
    # prepare input by adding padding
    test_input = list()
    for tweet in tweets:
        tweet_char_list = list()
        for word in tweet:
            tweet_char_list.extend([char for char in word.lower()])
        size = len(tweet_char_list)
        if size < max_tweet_size:
            for i in range(max_tweet_size - size):
                tweet_char_list.append('P')
        test_input.append(tweet_char_list)
    return test_input, test_output
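# Usage sketch (illustrative; mirrors prepCharBatchData but pads the whole test set at
# once; test_tweets/test_users/test_truth are hypothetical names and 600 is a placeholder
# maximum tweet size).
#
#   test_chars, test_targets = prepTestData(test_tweets, test_users, test_truth, 600)
#   test_char_ids = char2id(test_chars, char_lookup)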