forked from Serenitas/topic-modeller
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtf_idf_builder.py
47 lines (37 loc) · 1.31 KB
/
tf_idf_builder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import math, collections, operator, time
def build_td_idf_dict(corpora_origin):
    """Rank the words of every document by their TF-IDF weight.

    Each text is split on single spaces and every token has leading and
    trailing newline characters trimmed. No other normalisation is applied,
    so the empty tokens produced by consecutive spaces are kept as words.

    The weight of a word inside a document is ``tf * log(N / df)`` where
    ``tf`` is the value returned by ``compute_tf`` for that document, ``N``
    is the number of documents and ``df`` is the number of documents that
    contain the word at least once.

    Args:
        corpora_origin: iterable of document strings.

    Returns:
        A list with one entry per document, in input order. Each entry is a
        list of ``(word, weight)`` tuples sorted by weight, highest first.
    """
    corpora = [
        [token.strip('\n') for token in text.split(' ')]
        for text in corpora_origin
    ]
    count = len(corpora)

    # Document frequency: going through set() makes a word count once per
    # document no matter how often it repeats there. IDF needs the number of
    # documents containing the word, not the number of occurrences.
    words_docs = collections.Counter()
    for words in corpora:
        words_docs.update(set(words))

    documents_list = []
    for words in corpora:
        computed_tf = compute_tf(words)
        # float() keeps the ratio fractional even where "/" truncates ints.
        tf_idf_dictionary = {
            word: tf * math.log(count / float(words_docs[word]))
            for word, tf in computed_tf.items()
        }
        ranked = sorted(
            tf_idf_dictionary.items(),
            key=operator.itemgetter(1),
            reverse=True,
        )
        documents_list.append(ranked)
    return documents_list
def compute_tf(text):
    """Return the relative frequency of every distinct item in *text*.

    Args:
        text: sized collection of hashable items (for example a list of
            words).

    Returns:
        A ``collections.Counter`` mapping each distinct item to its number
        of occurrences divided by ``len(text)``. An empty input yields an
        empty counter.
    """
    occurrences = collections.Counter(text)
    if not occurrences:
        # Nothing to normalise, and len(text) is never consulted.
        return occurrences
    total = float(len(text))
    return collections.Counter(
        {item: hits / total for item, hits in occurrences.items()}
    )
def compute_idf(word, corpus):
    """Return ``log(len(corpus) / corpus[word])``.

    Args:
        word: key to look up in *corpus*.
        corpus: mapping from word to a numeric count.

    Returns:
        The natural logarithm of the number of entries in *corpus* divided
        by the count stored for *word*.

    Raises:
        KeyError: if *word* is missing from a plain ``dict``.
        ZeroDivisionError: if the stored count is zero.

    NOTE(review): ``len(corpus)`` is the number of distinct keys, so this is
    a textbook inverse document frequency only when that number equals the
    document count and the stored values are document frequencies.
    """
    # float() mirrors compute_tf: it keeps the ratio fractional on
    # interpreters where "/" between two ints truncates, which could
    # otherwise hand 0 to math.log.
    return math.log(len(corpus) / float(corpus[word]))
def build_tf_idf_collocations_dict(file):
    """Placeholder for a collocation-based TF-IDF builder.

    Not implemented: *file* is ignored and ``None`` is always returned.
    """
    return None