-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathranker.py
180 lines (159 loc) Β· 9.42 KB
/
ranker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
from math import sqrt
from math import log10
import numpy as np
import math
class Ranker:
def __init__(self):
# FREE PARAMETERS
self.B = 0.75
self.K = 1.2
# you can change whatever you want in this module, just make sure it doesn't
# break the searcher module
@staticmethod
def rank_relevant_docs(relevant_docs, k=None):
"""
This function provides rank for each relevant document and sorts them by their scores.
The current score considers solely the number of terms shared by the tweet (full_text) and query.
:param k: number of most relevant docs to return, default to everything.
:param relevant_docs: dictionary of documents that contains at least one term from the query.
:return: sorted list of documents by score
"""
ranked_results = sorted(relevant_docs.items(), key=lambda item: item[1], reverse=True)
if k is not None:
ranked_results = ranked_results[:k]
#print(ranked_results)
return [d[0] for d in ranked_results]
def getAxiomaticTermWeightingScore(self, relevant_docs, posting_dic, tweet_index, term, document):
N = len(tweet_index)
f = posting_dic[term][document]
term_frequency_in_query = 1
mone = f * math.pow(N, 0.35)
field_len = tweet_index[document][1]
avg_field_len = self.total_len(relevant_docs, tweet_index) / N
mehane = (f + 0.5 + (0.5 * float(field_len) / avg_field_len)) * len(posting_dic[term])
return mone / mehane
def BM25(self, relevant_docs, query, posting_dic, tweet_index):
"""
Calculate the score for each document according to the Okapi BM25
"""
document_score = {}
N = len(tweet_index)
avg_field_len = self.total_len(relevant_docs, tweet_index)/N
for document in relevant_docs.keys():
score = 0
for term in query:
if document in posting_dic[term].keys():
f = posting_dic[term][document]
field_len = tweet_index[document][1]
#term_score = self.getAxiomaticTermWeightingScore(relevant_docs,posting_dic, tweet_index, term, document)
score += (self.IDF(posting_dic, term, N) * ((self.K+1)*f)/(f+self.K*(1-self.B+self.B*(field_len/avg_field_len))))#+(term_score)
document_score[document] = score
return document_score
def IDF(self, posting_dic, term, N):
"""
Return the Inverse document frequency weight of the query term
"""
idf = np.log(((N - len(posting_dic[term]) + 0.5)/(len(posting_dic[term]) + 0.5)) + 1)
return idf
def total_len(self, relevant_docs, tweet_index):
"""
Return the total document lenght in the text collection from which documents are drawn
"""
total = 0
for doc, value in relevant_docs.items():
total += tweet_index[doc][1]
return total
@staticmethod
def _rank_relevant_doc(relevant_doc, number_of_tweet, tweet_file, term_in_query):
"""
This function provides rank for each relevant document and sorts them by their scores.
The current score considers solely the number of terms shared by the tweet (full_text) and query.
:param relevant_doc: dictionary of documents that contains at least one term from the query.
:return: sorted list of documents by score
"""
tweet_id = {}
df = {}
for term in relevant_doc.keys():
df[term] = len(relevant_doc[term])
for data in relevant_doc[term]:
if data[0] not in tweet_id:
tweet_id[data[0]] = [[term, data[1], data[2]]]
else:
tweet_id[data[0]].append([term, data[1], data[2]])
#print(tweet_id)
ranked_tweet = Ranker.cosSim(tweet_id, df, number_of_tweet, tweet_file, term_in_query)
#print(ranked_tweet)
return sorted(ranked_tweet.items(), key=lambda item: item[1], reverse=True)
#return sorted(relevant_doc.items(), key=lambda item: item[1], reverse=True)
@staticmethod
def retrieve_top_k(sorted_relevant_doc, k=1):
"""
return a list of top K tweets based on their ranking from highest to lowest
:param sorted_relevant_doc: list of all candidates docs.
:param k: Number of top document to return
:return: list of relevant document
"""
if k > len(sorted_relevant_doc):
return sorted_relevant_doc
result = [i for i in sorted_relevant_doc[:k]]
return result
@staticmethod
def cosSim(tweet_id, df, number_of_tweet, tweet_file, term_in_query):
ranked_tweet = {}
#tweet_file = utils.load_obj('tweet_index')
#inverted_index = utils.load_obj('inverted_index')
#print(inverted_index)
#print(inverted_index['COVID19'])
#print("term query : {}".format(term_in_query))
for id in tweet_id.keys():
tweet_data = tweet_id[id]
for data in tweet_data:
if id not in ranked_tweet:
# print(" tweet file by id {}".format( tweet_file[id]))
#print(data)
# print(data[1])
# print("***{}***".format(data[0]))
# print(inverted_index[data[0][:-1]])
#if id == '1280966287972237314':
# print(self.tf(tweet_file[id], data[1])*self.idf(data[0]))
# print(inverted_index[data[0][:-1]])
# print(tweet_file[id])
#ranked_tweet[id] = self.tf(tweet_file[id], data[1])*self.idf(data[0])
# print(id)
# print(data[1])
# print(tweet_file[id])
# print(df[data[0]])
# print(term_in_query[data[0][:-1]])
try:
ranked_tweet[id] = int(data[1])/int(tweet_file[id][0]) * log10(number_of_tweet/df[data[0]]) * term_in_query[data[0][:-1]]
except:
ranked_tweet[id] = int(data[1]) / int(tweet_file[id][0]) * log10(number_of_tweet / df[data[0]])
else:
#if id == '1280966287972237314':
# print(self.tf(tweet_file[id], data[1])*self.idf(data[0]))
# print(inverted_index[data[0][:-1]])
# print(tweet_file[id])
#ranked_tweet[id] += self.tf(tweet_file[id], data[1])*self.idf(data[0])
# print(id)
# print(data[1])
# print(tweet_file[id])
# print(df[data[0]])
# print(term_in_query[data[0][:-1]])
try:
ranked_tweet[id] = int(data[1]) / int(tweet_file[id][0]) * log10(
number_of_tweet / df[data[0]]) * term_in_query[data[0][:-1]]
except:
ranked_tweet[id] = int(data[1]) / int(tweet_file[id][0]) * log10(number_of_tweet / df[data[0]])
Wiq = 0
for term in term_in_query.keys():
Wiq += term_in_query[term]*term_in_query[term]
#print(Wiq)
for id in ranked_tweet.keys():
ranked_tweet[id] /= sqrt(Wiq*tweet_file[id][1]*tweet_file[id][1])
return ranked_tweet
if __name__ == '__main__':
r = Ranker()
dic = {'COVID19 ': [('1280966292908974083', 1, [15]), ('1280966293588344832', 1, [25]), ('1280966294624403461', 1, [13]), ('1280966295350054913', 1, [5]), ('1280966296595763201', 1, [16]), ('1280966297614876672', 1, [3]), ('1280966298219032577', 1, [4]), ('1280966299087253505', 1, [30]), ('1280966301788319744', 1, [13]), ('1280966304241860608', 1, [9]), ('1280966305546469378', 1, [16]), ('1280966306808819712', 1, [17]), ('1280966307266072582', 1, [9]), ('1280966309551865857', 1, [17]), ('1280966309795246082', 1, [10]), ('1280966313435774977', 1, [13]), ('1280966313481994240', 1, [14]), ('1280966314433982464', 1, [15]), ('1280966314756988928', 1, [13]), ('1280966317357572096', 1, [24]), ('1280966318150180865', 1, [14]), ('1280966320255893506', 1, [13]), ('1280966321409269760', 1, [14]), ('1280966321354805249', 1, [14]), ('1280966322646470656', 1, [15]), ('1280966325783932934', 3, [14, 18, 20]), ('1280966330280222720', 1, [16]), ('1280966330431295488', 1, [10]), ('1280966330670354432', 1, [13]), ('1280966332599734277', 1, [20]), ('1280966332121591813', 1, [19]), ('1280966334013038592', 1, [19]), ('1280966333711233026', 1, [15]), ('1280966336961826827', 1, [10]), ('1280966336982781953', 1, [13]), ('1280966337351823360', 1, [13])]}
rd = r.rank_relevant_doc(dic)
print(r.retrieve_top_k(rd , 5))
#r.rank_relevant_doc({'fucking ': [('1280966300202725376', 1, [2]), ('1280966300525715456', 1, [7]), ('1280966302522318849', 1, [7]), ('1280966305324171264', 1, [2]), ('1280966306854920192', 1, [7]), ('1280966312110493696', 1, [7]), ('1280966328640307200', 1, [3]), ('1280966330372493312', 1, [3])], '#coronavirus ': [('1280966293370425349', 1, [16]), ('1280966295442411521', 1, [27]), ('1280966296612593664', 1, [22]), ('1280966302664884226', 1, [12]), ('1280966305831673856', 1, [15]), ('1280966319207309316', 1, [9]), ('1280966325783932934', 1, [17]), ('1280966332599734277', 1, [19])], '@JoeBiden ': [('1280966306292998145', 1, [1]), ('1280966310357356554', 1, [1]), ('1280966317357572098', 1, [1]), ('1280966336550785026', 1, [1])]})