-
Notifications
You must be signed in to change notification settings - Fork 0
/
ranker_tf_idf.py
69 lines (66 loc) · 2.89 KB
/
ranker_tf_idf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import utils
import math
from datetime import timedelta
from timeit import default_timer as timer
from numpy.dual import norm
import numpy
class Ranker:
    """Ranks candidate documents against a query with a blended TF-IDF score
    (0.3 * inner product + 0.7 * cosine similarity)."""

    def __init__(self):
        pass

    @staticmethod
    def rank_relevant_docs(relevant_doc, qurey, indexer, model_vector=None, k=None):
        """Score each candidate document against the query and return doc ids
        sorted by descending similarity.

        :param relevant_doc: dict doc_id -> (num_query_terms_matched,
            [(term, occurrence_count), ...]) — documents sharing at least one
            query term.
        :param qurey: dict term -> term frequency in the query.
        :param indexer: object exposing `inverted_idx` (term ->
            [posting line, document frequency, total corpus occurrences]) and
            `documents_data` (doc_id -> [max term freq, #distinct terms,
            #terms, precomputed document vector length]).
        :param model_vector: unused; kept for interface compatibility.
        :param k: number of top documents to return; None returns all.
        :return: list of doc ids, best match first; docs scoring below the
            0.15 pruning threshold are dropped.
        """
        inverted_index = indexer.inverted_idx
        documents_data = indexer.documents_data
        # Hoisted out of the loop: corpus size is invariant across query terms.
        total_docs = len(documents_data)

        # --- Query side: per-term IDF and a single scalar TF-IDF weight. ---
        idf_dict = {}
        tf_idf_q = 0
        for term, tf_term_q in qurey.items():
            if term not in inverted_index:
                # BUGFIX: query terms absent from the corpus used to raise
                # KeyError; out-of-vocabulary terms are now simply ignored.
                continue
            df_term = inverted_index[term][1]
            idf_term = math.log(total_docs / df_term, 2)
            idf_dict[term] = idf_term
            tf_idf_q += tf_term_q * idf_term

        # --- Document side: accumulate TF-IDF mass per candidate document. ---
        docs_dict = {}
        for doc_id in relevant_doc:
            for term_name, count in relevant_doc[doc_id][1]:
                if term_name not in idf_dict:
                    # Term was skipped on the query side (no IDF available).
                    continue
                # TF normalized by the document's total term count.
                tf_term = int(count) / documents_data[doc_id][2]
                docs_dict[doc_id] = docs_dict.get(doc_id, 0) + tf_term * idf_dict[term_name]

        # --- Blend inner product with cosine similarity and prune. ---
        ranked = []
        for doc_id, doc_tfidf in docs_dict.items():
            inner_product = doc_tfidf * tf_idf_q
            doc_weight = documents_data[doc_id][3]  # precomputed vector length
            # sqrt(q^2 * d^2) == |q| * |d|; guard the zero-length case.
            denominator = math.sqrt(math.pow(tf_idf_q, 2) * math.pow(doc_weight, 2))
            cosine = inner_product / denominator if denominator != 0 else 0
            score = (0.3 * inner_product) + (0.7 * cosine)
            if score < 0.15:
                continue  # prune documents with negligible similarity
            ranked.append((score, doc_id))

        ranked.sort(reverse=True)
        if k is not None:
            ranked = ranked[:k]
        return [doc_id for _, doc_id in ranked]