# searcher_SpellingCorrection.py
from datetime import timedelta
from timeit import default_timer as timer
from spellchecker import SpellChecker
from ranker_tf_idf import Ranker
# DO NOT MODIFY CLASS NAME
class Searcher:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit. The model
    # parameter allows you to pass in a precomputed model that is already in
    # memory for the searcher to use, such as LSI, LDA, or Word2vec models.
    # MAKE SURE YOU DON'T LOAD A MODEL INTO MEMORY HERE AS THIS IS RUN AT QUERY TIME.
    def __init__(self, parser, indexer, model=None):
        self._parser = parser
        self._indexer = indexer
        self._ranker = Ranker()
        self._model = model

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=None):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results (tweet ids).
        Input:
            query - string.
            k - number of top results to return, defaults to everything.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        p = self._parser
        start_query = timer()
        query_as_list = p.parse_sentence(query)  # returns a list of tokens
        advance_query = {}  # key: term, value: tf of the term in the query
        start_searcher = timer()
        relevant_docs = self._relevant_docs_from_posting(query_as_list)
        end_searcher = timer()
        # print(str(timedelta(seconds=end_searcher - start_searcher)) + " searcher time")
        for term in query_as_list:
            if term in relevant_docs:
                advance_query[term] = query_as_list.count(term) / len(query_as_list)
            elif term.lower() in relevant_docs:
                advance_query[term.lower()] = query_as_list.count(term) / len(query_as_list)
        # key: doc_id, value: (number of query terms appearing in the doc, [(term, term count in doc)])
        relevant_doc_dict = self.get_relevant_doc_dict(relevant_docs)
        # keep only the 2000 docs that match the most query terms
        relevant_doc_items = sorted(relevant_doc_dict.items(), key=lambda item: item[1][0], reverse=True)
        relevant_doc_dict = dict(relevant_doc_items[:2000])
        start_ranking = timer()
        if self._model is not None:
            ranked_docs = self._ranker.rank_relevant_docs(relevant_doc_dict, advance_query, self._indexer, self._model)
        else:
            ranked_docs = self._ranker.rank_relevant_docs(relevant_doc_dict, advance_query, self._indexer)
        if k is not None:
            ranked_docs = ranked_docs[:k]  # honor the requested number of top results
        end_query = timer()
        # print(str(timedelta(seconds=end_query - start_ranking)) + " ranking time")
        # print(str(timedelta(seconds=end_query - start_query)) + " query time")
        return len(ranked_docs), ranked_docs
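
    # Example call (hypothetical query and output; assumes an index was
    # already built and loaded into the indexer):
    #   n_results, tweet_ids = searcher.search("covid vacine", k=10)
    #   -> "vacine" is corrected to "vaccine" before retrieval, and at most
    #      10 tweet ids come back, best match first.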

    # feel free to change the signature and/or implementation of this function
    # or drop altogether.
    def _relevant_docs_from_posting(self, query_as_list):
        """
        Loads the posting lists and collects the relevant documents per term.
        :param query_as_list: parsed query tokens
        :return: dictionary mapping each query term to its posting entries,
                 a list of (term count in doc, doc_id) tuples.
        """
        relevant_docs = {}
        query = self.fix_query_spelling(query_as_list)
        inverted_index = self._indexer.inverted_idx
        posting_dict = self._indexer.postingDict
        for term in query:
            try:  # collect the term's posting data
                try:
                    # skip terms that appear in too many documents
                    if inverted_index[term][1] > self._indexer.config.get_cut_by():
                        continue
                    term_data = inverted_index[term]
                    file_name = term_data[0][0]
                    term_line_in_posting = term_data[0][1]
                    relevant_docs[term] = posting_dict[file_name][term_line_in_posting]
                except KeyError:
                    # fall back to lower case, for capitalized terms and
                    # entities such as 'NILLI' or 'Donald Trump'
                    term_data = inverted_index[term.lower()]
                    file_name = term_data[0][0]
                    term_line_in_posting = term_data[0][1]
                    relevant_docs[term.lower()] = posting_dict[file_name][term_line_in_posting]
            except Exception:
                pass  # term is not in the index at all
        return relevant_docs  # dict: key = term, value = list of (term count in doc, doc_id)
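
    # Illustrative shapes of the index structures read above (the concrete
    # values are hypothetical, inferred from the access pattern; the real
    # structures are built by this project's Indexer):
    #   inverted_idx["trump"] -> [("posting_7", 42), 1350]
    #       index 0: (posting file name, line of the term in that file)
    #       index 1: document-frequency-like count compared against the
    #                cut_by threshold
    #   postingDict["posting_7"][42] -> [(3, "1280214540353863680"), ...]
    #       a list of (term count in doc, tweet id) pairs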

    def get_relevant_doc_dict(self, relevant_docs):
        """
        Groups posting entries by document.
        :param relevant_docs: dict, key = term, value = list of
               (term count in doc, doc_id) tuples.
        :return: dict, key = doc_id, value = (number of distinct query terms
                 appearing in the doc, list of (term, term count in doc) tuples).
        """
        docs_dict = {}
        for term in relevant_docs:
            for doc_details in relevant_docs[term]:
                doc_id = doc_details[1]
                if doc_id in docs_dict:
                    # skip duplicate (term, doc) pairs from the corpus
                    already_counted = any(term_in_doc[0] == term for term_in_doc in docs_dict[doc_id][1])
                    if not already_counted:
                        sum_terms = docs_dict[doc_id][0] + 1
                        docs_dict[doc_id] = (sum_terms, docs_dict[doc_id][1] + [(term, doc_details[0])])
                else:
                    docs_dict[doc_id] = (1, [(term, doc_details[0])])
        return docs_dict
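
    # A made-up example of the grouping above (doc ids and counts are
    # illustrative only):
    #   relevant_docs = {"trump": [(3, "d1"), (1, "d2")], "covid": [(2, "d1")]}
    #   get_relevant_doc_dict(relevant_docs)
    #   -> {"d1": (2, [("trump", 3), ("covid", 2)]), "d2": (1, [("trump", 1)])}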

    def fix_query_spelling(self, query):
        """
        Appends the most likely correction of every misspelled query token.
        The original tokens are kept alongside the corrections, so that
        intentionally unusual terms (names, hashtags) can still match the index.
        """
        spell = SpellChecker()
        # find the tokens that may be misspelled
        misspelled = spell.unknown(query)
        for word in misspelled:
            # get the one most likely correction
            correction = spell.correction(word)
            if correction and correction != word:
                query.append(correction)
        return query
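

# A minimal, self-contained sketch of the spelling-correction step in
# isolation; no parser or index is needed, though the module-level imports
# above still require the project's ranker_tf_idf to be importable. Only the
# pyspellchecker package is assumed, and the query tokens are illustrative.
if __name__ == "__main__":
    spell = SpellChecker()
    query = ["donald", "trmp", "vacine"]
    for word in spell.unknown(query):  # e.g. {"trmp", "vacine"}
        correction = spell.correction(word)
        if correction and correction != word:
            query.append(correction)
    # original tokens plus corrections, e.g. [..., "trump", "vaccine"]
    print(query)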