vectorial_search.py
from collections import Counter, defaultdict
from math import log10
import operator

from reverse_index_builder import Reverse_index_builder


class Vectorial_search:
    SIMILARITY_COSINE = 'cosine'
    SIMILARITY_DICE = 'dice'
    SIMILARITY_JACCARD = 'jaccard'
    SIMILARITY_OVERLAP = 'overlap'
    SIMILARITY_MODEL_LIST = [SIMILARITY_COSINE, SIMILARITY_DICE, SIMILARITY_JACCARD, SIMILARITY_OVERLAP]

    def __init__(self, reverse_index, similarity=SIMILARITY_COSINE):
        similarity = similarity.lower()
        if similarity not in self.SIMILARITY_MODEL_LIST:
            raise ValueError('Unknown similarity method: %s' % similarity)
        self.reverse_index = reverse_index
        self.similarity_method = similarity
        self.ponderation = self.reverse_index.other_infos['ponderation_method']
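
    # The reverse index is expected to expose get_all_words(),
    # get_ids_for_term(word), get_ponderation(word, document_id), an `idf`
    # counter, and an `other_infos` dict with 'ponderation_method', 'norms',
    # 'number_of_documents' and 'max_unnormalized_ponderation' entries;
    # this is the interface the methods below rely on.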
    def do_search(self, query):
        """
        Vectorial search on a query that has been processed by Process_query.
        Returns a list of (document_id, similarity) pairs, sorted by
        decreasing similarity.
        """
        # Only search on words that are actually in the corpus
        significant_query_words = list(set(query).intersection(set(self.reverse_index.get_all_words())))
        # Do the search
        similarities = self._search(significant_query_words)
        # Remove documents with a similarity of 0
        positive_similarities = {}
        for document_id, similarity in similarities.items():
            if similarity > 0:
                positive_similarities[document_id] = similarity
        # Rank the remaining documents
        return sorted(positive_similarities.items(), key=operator.itemgetter(1), reverse=True)

    def _search(self, query_words):
        document_similarities = {}
        query_weights = self._query_weight(query_words, self.reverse_index.idf)
        # Linear (L1) and quadratic (squared L2) norms of the query vector
        query_norms = {
            'linear': sum(query_weights.values()),
            'quadratic': sum(x * x for x in query_weights.values())
        }
        documents_unnormalized_similarities = defaultdict(float)
        # Multiply ponderations from document and from query
        for word in query_words:
            for document_id in self.reverse_index.get_ids_for_term(word):
                ponderation = self.reverse_index.get_ponderation(word, document_id)
                documents_unnormalized_similarities[document_id] += getattr(
                    self, '_get_search_numerator_' + self.similarity_method
                )(ponderation, query_weights[word])
        # Then, divide each document by the normalizing term for the given similarity method
        for document_id in documents_unnormalized_similarities:
            denominator = getattr(self, '_get_normalizing_term_' + self.similarity_method)(
                self.reverse_index.other_infos['norms'][document_id],
                query_norms,
                documents_unnormalized_similarities[document_id]
            )
            document_similarities[document_id] = documents_unnormalized_similarities[document_id] / float(denominator)
        return document_similarities
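
    # All four measures share a dot-product numerator between the document
    # and query weight vectors; they differ in the normalizing denominator:
    #   cosine:  dot(d, q) / (||d|| * ||q||)           (||.||: L2 norm)
    #   Dice:    2 * dot(d, q) / (|d| + |q|)           (|.|: L1 norm)
    #   Jaccard: dot(d, q) / (|d| + |q| - dot(d, q))
    #   overlap: dot(d, q) / min(|d|, |q|)
    # The 'linear' norm entries used below hold L1 norms and the 'quadratic'
    # entries hold squared L2 norms, matching these formulas.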
    def _get_search_numerator_cosine(self, document_word_ponderation, query_weight):
        return document_word_ponderation * query_weight

    def _get_search_numerator_dice(self, document_word_ponderation, query_weight):
        return 2 * document_word_ponderation * query_weight

    def _get_search_numerator_jaccard(self, document_word_ponderation, query_weight):
        return document_word_ponderation * query_weight

    def _get_search_numerator_overlap(self, document_word_ponderation, query_weight):
        return document_word_ponderation * query_weight

    def _get_normalizing_term_cosine(self, document_norms, query_norms, document_unnormalized_similarities):
        return (document_norms['quadratic'] * query_norms['quadratic']) ** 0.5

    def _get_normalizing_term_dice(self, document_norms, query_norms, document_unnormalized_similarities):
        return document_norms['linear'] + query_norms['linear']

    def _get_normalizing_term_jaccard(self, document_norms, query_norms, document_unnormalized_similarities):
        return document_norms['linear'] + query_norms['linear'] - document_unnormalized_similarities

    def _get_normalizing_term_overlap(self, document_norms, query_norms, document_unnormalized_similarities):
        return min(document_norms['linear'], query_norms['linear'])

    def _query_weight(self, query_words, idf_counter):
        tf_counter = Counter(query_words)
        # We assume that the corpus is so large that the bag of words of the
        # query won't significantly change the idf count; in other words,
        # that we don't care about it.
        # See http://nlp.stanford.edu/IR-book/html/htmledition/queries-as-vectors-1.html
        if self.ponderation == Reverse_index_builder.PONDERATION_TF_IDF:
            query_weights = self._query_weight_tf_idf(query_words, idf_counter, tf_counter)
        elif self.ponderation == Reverse_index_builder.PONDERATION_NORMAL_TF_IDF:
            query_weights_unnormalized = self._query_weight_tf_idf(query_words, idf_counter, tf_counter)
            query_weights = defaultdict(float)
            for word in query_weights_unnormalized:
                query_weights[word] = query_weights_unnormalized[word] / float(self.reverse_index.other_infos['max_unnormalized_ponderation'][word])
        elif self.ponderation == Reverse_index_builder.PONDERATION_NORMAL_FREQUENCY:
            query_weights = self._query_weight_normalized_frequency(query_words)
        else:
            # Guard against an unsupported ponderation method: without this,
            # query_weights would be undefined below.
            raise ValueError('Unknown ponderation method: %s' % self.ponderation)
        return query_weights

    def _query_weight_tf_idf(self, query_words, idf_counter, tf_counter):
        # idf_counter maps each word to its document frequency df; the idf
        # itself is computed here as log10(N / df).
        query_weights = defaultdict(float)
        N = self.reverse_index.other_infos['number_of_documents']
        for word in query_words:
            query_weights[word] = (1 + self._custom_log(tf_counter[word])) * log10(float(N) / idf_counter[word])
        return query_weights
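
    # Worked example for the tf-idf weight above (hypothetical numbers): a
    # word occurring twice in the query and present in 10 of 1000 documents
    # gets (1 + log10(2)) * log10(1000 / 10) = 1.3010 * 2 ≈ 2.602.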

    def _query_weight_normalized_frequency(self, query_words):
        word_count = Counter(query_words)
        # When the query does not have any significant word.
        if not word_count:
            return {}
        maximum_frequency = word_count.most_common(1)[0][1]  # Find the highest frequency in the query
        query_weights = defaultdict(float)
        for word in query_words:
            query_weights[word] = word_count[word] / float(maximum_frequency)
        return query_weights

    def _custom_log(self, number):
        # log10 with the convention log(0) = 0, so that an absent term
        # contributes nothing to the weight.
        if number > 0:
            return log10(float(number))
        return 0
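
# A minimal usage sketch, assuming the reverse index matches the interface
# described above and that the query has already been processed by
# Process_query (both names come from this project, but their exact APIs
# are not shown in this file, so treat this as illustrative only):
#
#     reverse_index = ...  # built with Reverse_index_builder
#     searcher = Vectorial_search(reverse_index, similarity=Vectorial_search.SIMILARITY_COSINE)
#     ranked_documents = searcher.do_search(processed_query_words)
#     # ranked_documents is a list of (document_id, similarity) pairs,
#     # best match first.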