# getTfidf.py
import sys
import os
import pandas as pd
import pickle as p
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
def no_tokenizer(doc):
    # The token lists are already tokenised, so just pass them through unchanged.
    return doc
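# A minimal sketch of the expected input, assuming token pickles produced by
# getTokens.py: one flat list of token strings per source file, e.g.
# li_tokens = [['obama', 'senate', 'vote'], ['obama', 'policy', 'vote']]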
def getTfidf(li_tokens, li_filenames, filename, min_df=0, max_df=0, top_n=25):
    """
    Creates a csv with the top n highest scoring tf-idf words.

    :param li_tokens: list of token lists, one per input file. Should be unpickled first.
    :param li_filenames: the names of the token files, used as column headers.
    :param filename: the name of the output file, based on the input folder.
    :param min_df: the amount of files a term may be missing from. Passed inverted to sklearn's TfidfVectorizer.
    :param max_df: the amount of files a term must be missing from, to filter words that appear in (nearly) all token lists. Passed inverted to sklearn's TfidfVectorizer.
    :param top_n: the amount of top-scoring terms to keep per token file.
    """
    # Create an output folder if it doesn't exist yet
    if not os.path.exists('output/'):
        os.makedirs('output/')
    if not os.path.exists('output/tfidf/'):
        os.makedirs('output/tfidf/')
    # min_df and max_df are given as the number of files a term may (min_df)
    # or must (max_df) be missing from; convert them to the absolute document
    # counts that TfidfVectorizer expects.
    if min_df == 0:
        min_df = len(li_tokens)
    else:
        min_df = len(li_tokens) - min_df
    print('Terms must appear in at least ' + str(min_df) + ' of the total ' + str(len(li_tokens)) + ' files.')
    if max_df == 0:
        max_df = len(li_tokens)
    else:
        max_df = len(li_tokens) - max_df
    print('Terms may appear in at most ' + str(max_df) + ' of the total ' + str(len(li_tokens)) + ' files.')
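    # Worked example of the inversion, assuming twelve monthly token files:
    # --min_df=5 becomes 12 - 5 = 7, so terms must appear in at least seven
    # months; --max_df=1 becomes 12 - 1 = 11, so terms may appear in at most
    # eleven months (i.e. never in all twelve).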
    output = 'output/tfidf/' + filename + '_tfidf.csv'

    print('Vectorizing!')
    # The input is already tokenised, so hand the token lists to the
    # vectorizer unchanged instead of letting it tokenise strings itself.
    tfidf_vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, analyzer='word', token_pattern=None, tokenizer=no_tokenizer, lowercase=False)
    tfidf_matrix = tfidf_vectorizer.fit_transform(li_tokens)
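    # Sanity sketch: tfidf_matrix is a sparse (n_files, n_terms) matrix, so
    # with e.g. twelve monthly token files and 4,000 surviving terms it is
    # 12 x 4000, holding one tf-idf weight per term per file.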
    #print(tfidf_matrix[:10])

    feature_array = np.array(tfidf_vectorizer.get_feature_names_out())
    tfidf_sorting = np.argsort(tfidf_matrix.toarray()).flatten()[::-1]

    # Store the top n highest scoring tf-idf terms
    top_words = feature_array[tfidf_sorting][:top_n]
    #print(top_words)
    # Compute the mean tf-idf weight per term across all files
    weights = np.asarray(tfidf_matrix.mean(axis=0)).ravel().tolist()
    df_weights = pd.DataFrame({'term': tfidf_vectorizer.get_feature_names_out(), 'weight': weights})
    df_weights = df_weights.sort_values(by='weight', ascending=False).head(100)
    #df_weights.to_csv(output[:-4] + '_top100_terms.csv')
    #print(df_weights.head())
    df_matrix = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

    # Transpose the dataframe so that terms are rows and files are columns
    df_matrix = df_matrix.transpose()
    #print('Amount of words: ' + str(len(df_matrix)))

    print('Writing tf-idf vector to csv')

    # Label the columns with the token filenames, keeping their original order
    df_matrix.columns = li_filenames
    df_matrix = df_matrix[li_filenames]
    #df_matrix.to_csv(output[:-4] + '_matrix.csv')
    df_full = pd.DataFrame()

    print('Writing top ' + str(top_n) + ' terms per token file to "' + output[:-4] + '_full.csv"')

    # Store the top terms per token file in a csv: for each file, one column
    # with the terms and one column with their tf-idf scores
    for index, doc in enumerate(df_matrix):
        df_tim = (df_matrix.sort_values(by=[doc], ascending=False))[:top_n]
        df_timesep = pd.DataFrame()
        df_timesep[doc] = df_tim.index.values[:top_n]
        df_timesep['tfidf_score_' + str(index + 1)] = df_tim[doc].values[:top_n]
        df_full = pd.concat([df_full, df_timesep], axis=1)
    df_full.to_csv(output[:-4] + '_full.csv')
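    # The resulting csv interleaves term and score columns, roughly like
    # (filenames hypothetical):
    #   2016-01.p, tfidf_score_1, 2016-02.p, tfidf_score_2, ...
    #   obama,     0.31,          senate,    0.27,          ...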
    print('Writing a RankFlow-proof csv to "' + output[:-4] + '_rankflow.csv"')
    # Copy the dataframe so the scaling below doesn't alter df_full
    df_rankflow = df_full.copy()
    #df_rankflow = df_rankflow.drop(df_rankflow.columns[0], axis=1)
    # RankFlow wants integer weights, so scale the tf-idf scores up
    for col in df_rankflow.columns:
        #print(col)
        if 'tfidf' in col:
            li_scores = df_rankflow[col].tolist()
            vals = [int(tfidf * 100) for tfidf in li_scores]
            df_rankflow[col] = vals
    df_rankflow.to_csv(output[:-4] + '_rankflow.csv', encoding='utf-8', index=False)
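    # Scaling sketch: a score of 0.3127 becomes int(0.3127 * 100) = 31,
    # an integer RankFlow can use as a rank weight.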
    print('Done!')
# Show the manual if needed
if len(sys.argv) < 2:
    print()
    print("Creates a csv file of tf-idf results.")
    print("Use getTokens.py first to get a folder with tokens which this script can then use.")
    print()
    print("Usage: python3 getTfidf.py [--source] [--min_count] [--min_df] [--max_df] [--top]")
    print()
    print("--source: the relative path to a folder containing tokens created with getTokens (e.g. 'output/obama_tokens/').")
    print("--min_count (optional): filters out the terms that appear less than this amount of times in one token file.\nE.g. --min_count=50 will delete all the words that appear less than 50 times in the month/week.")
    print("--min_df (optional): the amount of token files a term may be missing from.\nE.g. with tokens from 12 months, --min_df=5 will mean terms must appear in at least 7 of the 12 months.")
    print("--max_df (optional): filters out the terms that appear in (nearly) all documents.\nE.g. with tokens from 12 months, --max_df=1 will mean terms can only appear in 11 months.")
    print("--top (optional): default 25. The amount of terms to keep in the csv.")
    print()
    print("Example:")
    print("python3 getTfidf.py --source=data/obama_tokens/ --max_df=1 --top=50")
    print()
    sys.exit(1)
else:
    li_args = []
    source = ''
    min_count = 0
    max_df = 0
    min_df = 1
    top = 25

    # Interpret command line arguments
    for arg in sys.argv:
        if "--source=" in arg:
            source = arg[9:]
            li_args.append(source)
            if not source.endswith('/'):
                source = source + '/'
            # Use the folder name as the filename for the output
            filename = str(source).split('/')
            filename = filename[len(filename) - 2]
            li_args.append(filename)
        elif "--min_count=" in arg:
            min_count = int(arg[12:])
            li_args.append(min_count)
        elif "--max_df=" in arg:
            max_df = int(arg[9:])
            li_args.append(max_df)
        elif "--min_df=" in arg:
            min_df = int(arg[9:])
            li_args.append(min_df)
        elif "--top=" in arg:
            top = int(arg[6:])
            li_args.append(top)
    print(li_args)

    if source == '' or not os.path.isdir(source):
        print("Please provide a valid input folder like this: --source=output/obama_tokens/")
        sys.exit(1)
    li_tokens = []
    li_filenames = []
    di_counts = {}

    # Unpickle the tokens
    for index, file in enumerate(os.listdir(source)):
        if file.endswith('.p'):
            li_filenames.append(file)
            token_filename = source + file
            tokens = p.load(open(token_filename, 'rb'))

            # Make a flat list of tokens for the full document
            time_tokens = []
            for tokenlist in tokens:
                #print(tokenlist)
                words = set(tokenlist)
                #print(words)
                for word in words:
                    time_tokens.append(word)
                    # Keep track of how many token lists the word appears in
                    if word not in di_counts:
                        di_counts[word] = 1
                    else:
                        di_counts[word] = di_counts[word] + 1
            li_tokens.append(time_tokens)
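    # Assumed pickle layout from getTokens.py: each .p file holds a list of
    # token lists, e.g. one list per post in that month:
    # [['obama', 'vote'], ['obama', 'policy'], ...]
    # Because of the set() above, each post contributes a word at most once.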
    print('Files to compare and get tf-idf terms for:')
    print(li_filenames)

    # Get rid of rarely used terms
    if min_count != 0:
        print('Deleting words that appeared less than ' + str(min_count) + ' times')
        li_frequent_tokens = []
        for index, all_tokens in enumerate(li_tokens):
            frequent_tokens = []
            for single_token in all_tokens:
                if di_counts[single_token] >= min_count:
                    frequent_tokens.append(single_token)
            li_frequent_tokens.append(frequent_tokens)
            print(str(len(frequent_tokens)) + ' terms in file ' + str(index + 1) + ' used at least ' + str(min_count) + ' times.')
        li_tokens = li_frequent_tokens
    print('Getting tf-idf terms for the following token files:')
    print(li_filenames)
    getTfidf(li_tokens, li_filenames, filename, min_df=min_df, max_df=max_df, top_n=top)