-
Notifications
You must be signed in to change notification settings - Fork 14
/
universe.py
executable file
·93 lines (76 loc) · 3.61 KB
/
universe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import settings
import time
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.preprocessing import normalize
from itertools import count
class Universe(object):
"""Universe: Universal variables like global terms and global frequencies"""
def __init__(self, debug):
self.debug = debug
self.n_items = 0
self.terms_positions = dict()
self.n_terms = 0
self.df = np.array([])
self.log_n_df = np.array([])
self.step = 0
self.ordered_list_of_terms = []
def increase_items(self):
self.n_items+=1
def remove_n_items(self, n):
self.n_items-=n
def remove_df(self, removing_df):
self.df -= removing_df
def update_terms(self, terms):
for term in terms:
if term not in self.terms_positions:
self.terms_positions[term] = self.n_terms
self.n_terms += 1
self.df = np.append(self.df, 1)
self.ordered_list_of_terms.append(term)
else:
term_position = self.terms_positions[term]
self.df[term_position] += 1
def get_universal_item_counts(self, terms, counts):
universal_count = sparse.dok_matrix((self.n_terms, 1))
for term, count in zip(terms, counts):
term_position = self.terms_positions[term]
universal_count[term_position, 0] = count
return universal_count
def get_universal_tfidf(self, universal_count):
tfidf = universal_count.T.multiply(self.log_n_df).T
tfidf = normalize(tfidf, norm='l2', axis=0)
return tfidf
def compute_log_n_df(self):
self.debug.log("Computing log_n_df...")
operation_time = time.time()
self.log_n_df = np.log(self.n_items/self.df)
# warning by zero division, tweets with this terms are not more active
self.log_n_df[self.df==0] = 0
# log(1)=0 !!!
self.log_n_df[self.df==self.n_items] = 0
self.debug.log("\tFinished in: "+str(time.time()-operation_time))
self.debug.log("\tlog_n_df shape: "+str(self.log_n_df.shape))
def prune_terms(self, clusters):
self.step += 1
if settings.pruning and self.step % settings.pruning_frequency == 0:
self.debug.log("Pruning terms...")
operation_time = time.time()
terms_to_remove_positions = np.where(self.df == 0)[0]
self.remove_terms(terms_to_remove_positions)
clusters.remove_terms(terms_to_remove_positions)
self.debug.log("\tFinished in: "+str(time.time()-operation_time))
def remove_terms(self, terms_to_remove_positions):
self.debug.log("\tRemoving "+str(len(terms_to_remove_positions))+" term positions from universal df...")
self.debug.log("\tInitial shape:"+str(self.df.shape))
self.df = np.delete(self.df, terms_to_remove_positions)
self.debug.log("\tFinal shape:"+str(self.df.shape))
self.debug.log("\tRemoving "+str(len(terms_to_remove_positions))+" terms from ordered list of terms...")
self.debug.log("\tInitial shape:"+str(len(self.ordered_list_of_terms)))
self.ordered_list_of_terms = list(np.delete(self.ordered_list_of_terms, terms_to_remove_positions))
self.debug.log("\tFinal shape:"+str(len(self.ordered_list_of_terms)))
self.terms_positions = {key:val for key, val in zip(self.ordered_list_of_terms, count())}
self.debug.log("\tFinal lenght of list of terms: "+str(len(self.ordered_list_of_terms)))
self.n_terms = len(self.terms_positions)
self.debug.log("\tFinal amount of terms: "+str(self.n_terms))