createTokens.py
import sys
import os
import re
import pickle
import pandas as pd
import datetime
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
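
# Note: the NLTK resources used below (the English stopword list and WordNet)
# have to be downloaded once before running this script, e.g.:
#
#   import nltk
#   nltk.download('stopwords')
#   nltk.download('wordnet')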

def createTokens(li_strings, output, stemming=False, lemmatizing=False, time=''):
    """
    Tokenizes a list of strings and saves the tokens as a pickle file in
    'output/<output>/'. Tokens can optionally be stemmed or lemmatized.
    """
    # Do some cleanup: only alphabetic characters, no stopwords.
    # Create separate stemmed tokens, to which the full strings will be compared.
    li_all_tokens = []
    len_comments = len(li_strings)
    print(len_comments)

    time_log = ''
    if time != '':
        time_log = ' for date slice ' + time
    print('Creating list of tokens' + time_log)

    for index, comment in enumerate(li_strings):
        # Create a list of tokens per comment
        if isinstance(comment, str):
            li_tokens = getFilteredText(comment, stemming=stemming, lemmatizing=lemmatizing)
            li_all_tokens.append(li_tokens)
        if index % 100 == 0:
            print('Tokenizing finished for string ' + str(index) + '/' + str(len_comments))

    print('Saving tokens')
    # Fall back to a timestamped file name when no date slice is given
    if time == '':
        time = str(datetime.datetime.now().strftime("%Y%m%d%H%M%S"))
    if not os.path.exists('output/'):
        os.makedirs('output/')
    if not os.path.exists('output/' + output):
        os.makedirs('output/' + output)
    pickle.dump(li_all_tokens, open('output/' + output + 'tokens_' + time + '.p', 'wb'))
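
# A minimal usage sketch (illustrative only - the strings and folder name below
# are assumptions, not part of the pipeline):
#
#   comments = ['Some (((echoed))) comment, see http://example.com', 'another comment']
#   createTokens(comments, 'example_tokens/', stemming=False, lemmatizing=True)
#   # -> writes output/example_tokens/tokens_<timestamp>.p containing a list of token lists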

# Lookup dict with stemmed tokens as keys and lists of the full tokens they
# were derived from as values; getFilteredText() fills it when stemming is used.
di_stems = {}

def getFilteredText(string, stemming=False, lemmatizing=False):
    """
    Cleans a single string and returns a list of filtered tokens,
    optionally stemmed (Snowball) or lemmatized (WordNet).
    """
    # First, remove URLs
    if 'http' in string:
        string = re.sub(r'https?:\/\/.*[\r\n]*', ' ', string)
    if 'www.' in string:
        string = re.sub(r'www.*[\r\n]*', ' ', string)

    # Get a list of candidate words
    tokens = re.findall(r"[a-zA-Z\-\)\(]{3,50}", string)

    stemmer = SnowballStemmer("english")
    lemmatizer = WordNetLemmatizer()

    # List with tokens that survive filtering
    li_filtered_tokens = []

    # Filter out any tokens not containing letters (e.g. numeric tokens, raw punctuation)
    for token in tokens:
        token = token.lower()
        # Only alphabetic characters, keep '(' and ')' symbols for echo brackets,
        # and only tokens with three or more characters
        if re.match(r'[a-zA-Z\-\)\(]{3,50}', token) is not None:
            # No stopwords
            if token not in stopwords.words('english'):
                # Shorten the word if it's longer than 20 characters (e.g. 'reeeeeeeeeeeeeeeeeeeeeeeee')
                if len(token) >= 20:
                    token = token[:20]
                # Stem if indicated it should be stemmed
                if stemming:
                    token_stemmed = stemmer.stem(token)
                    li_filtered_tokens.append(token_stemmed)
                    # Update the lookup dict with the stemmed token and its full form
                    if token_stemmed in di_stems:
                        if token not in di_stems[token_stemmed]:
                            di_stems[token_stemmed].append(token)
                    else:
                        di_stems[token_stemmed] = [token]
                # If lemmatizing is used instead
                elif lemmatizing:
                    token = lemmatizer.lemmatize(token)
                    # Normalize laughter variants ('hahahaha') - needed for 'kek' w2v similarities
                    if token[:5] == 'hahah':
                        token = 'haha'
                    elif token[:5] == 'ahaha':
                        token = 'ahah'
                    li_filtered_tokens.append(token)
                else:
                    li_filtered_tokens.append(token)

    return li_filtered_tokens
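
# Example of the filtering behaviour (hypothetical input; the output assumes
# lemmatizing=True and the default NLTK English stopword list):
#
#   getFilteredText('The cats were hahahahaha funny, see www.example.com', lemmatizing=True)
#   # -> ['cat', 'haha', 'funny', 'see'] - URL stripped, stopwords and short tokens dropped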

# Show manual if needed
if len(sys.argv) < 2:
    print()
    print("Tokenizes text from a csv column.")
    print("Saves the results in binary files (using the library pickle).")
    print("Can stem and/or lemmatize.")
    print()
    print("Usage: python3 createTokens.py [--source] [--text] [--stem] [--lemma] [--timespan] [--timecol]")
    print()
    print("--source: the relative path to a csv file from 4CAT (e.g. 'data/datasheet.csv').")
    print("--text (optional): default 'body' - the csv column with the text.")
    print("--stem (optional): default false - whether to stem the text.")
    print("--lemma (optional): default true - whether to lemmatize the text.")
    print("--timespan (optional): if provided, create output per month/day. Use 'months' or 'days'.")
    print("--timecol (optional): default 'timestamp' - the csv column in which the time values are stored. Should start with format yyyy-mm-dd.")
    print()
    print("Example:")
    print("  python3 createTokens.py --source=input/4cat-datasheet.csv --text=body --stem=False --lemma=True --timespan=months --timecol=timestamp")
    print()
    sys.exit(1)
else:
    li_args = []
    source = ''
    text_column = 'body'
    time_column = 'timestamp'
    stem = False
    lemma = True
    timespan = False

    # Interpret command line arguments
    for arg in sys.argv:
        if arg[0:9] == "--source=":
            source = arg[9:len(arg)]
            li_args.append(source)
            # Use the csv file name (without extension) as the folder name for the tokens
            folder = str(source).split('/')
            folder = folder[len(folder) - 1]
            folder = folder[:-4] + '_tokens/'
            li_args.append(folder)
        elif arg[0:9] == "--output=":
            output = arg[9:len(arg)]
            li_args.append(output)
        elif arg[0:7] == "--text=":
            text_column = arg[7:len(arg)]
            li_args.append(text_column)
        elif arg[0:7] == "--stem=":
            stem_input = arg[7:len(arg)]
            if stem_input.lower() == 'true':
                stem = True
            li_args.append(stem)
        elif arg[0:8] == "--lemma=":
            lemma_input = arg[8:len(arg)]
            if lemma_input.lower() == 'false':
                lemma = False
            li_args.append(lemma)
        elif arg[0:11] == "--timespan=":
            timespan = arg[11:len(arg)]
            li_args.append(timespan)
        elif arg[0:10] == "--timecol=":
            time_column = arg[10:len(arg)]
            li_args.append(time_column)

    print(li_args)

    if source == '' or not os.path.isfile(source):
        print("Please provide a valid input file like this: --source=data/datasheet.csv")
        sys.exit(1)
    else:
        df = pd.read_csv(source)

        if timespan == False:
            df = df.sort_values(by=[time_column])
            li_input = df[text_column].tolist()
            createTokens(li_input, folder, stemming=stem, lemmatizing=lemma)
        else:
            print('Tokenizing per ' + timespan)
            # Get the dates to make files for
            li_all_dates = df[time_column].tolist()
            if timespan == 'days':
                li_dates = [date[0:10] for date in li_all_dates]
            elif timespan == 'months':
                li_dates = [date[0:7] for date in li_all_dates]
            else:
                print('Please provide a valid date format (\'days\' or \'months\')')
                sys.exit(1)

            li_dates = sorted(set(li_dates))
            print('Timespans to make tokens for:')
            print(li_dates)

            for date_slice in li_dates:
                df_date = df[df[time_column].str.contains(date_slice)]
                li_input = df_date[text_column].tolist()
                createTokens(li_input, folder, stemming=stem, lemmatizing=lemma, time=date_slice)
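
# To inspect one of the resulting files (hypothetical path, assuming the example
# command from the manual with --timespan=months was run):
#
#   import pickle
#   tokens = pickle.load(open('output/4cat-datasheet_tokens/tokens_2019-01.p', 'rb'))
#   print(tokens[0])  # tokens of the first text in that date slice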