# IndexConstructor_WithoutHash.py
import os
import sys
from bs4 import BeautifulSoup
from bs4.element import Comment
import nltk
from nltk import PorterStemmer
import heapq
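
# Overview (inferred from the code below): this script builds a positional
# inverted index. For each corpus file it extracts the visible text from the
# HTML, tokenizes it, removes stop words, stems with the Porter stemmer, and
# emits (termid, docid, positionid) tuples. The per-document sorted runs are
# then merged with a min-heap and written to term_index.txt with document and
# position gaps instead of absolute values.
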
class PriorityNode:
    def __init__(self, arraynumber, arrayindex, value):
        self.arraynumber = arraynumber  # number of the run this value was picked from
        self.arrayindex = arrayindex    # index within that run where the value sits
        self.value = value              # the (termid, docid, positionid) tuple itself

    def __lt__(self, other):
        return self.value < other.value

class TermAppearance:
    def __init__(self, termid):
        self.termid = termid      # term id
        self.doclist = []         # list of per-document lists: [doc gap, position gap, position gap, ...]
        self.totalappearance = 0  # total occurrences of the term across the corpus
        self.docsappeared = 0     # number of documents the term appears in

def visible(element):
    # Return True only for text that is actually rendered on the page.
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    elif isinstance(element, Comment):  # filter out HTML comments
        return False
    return True

def parsehtml(rawcontent):
    # Parse the HTML and return its visible text as a single space-joined string.
    soup = BeautifulSoup(rawcontent, 'html.parser')
    data = soup.find_all(string=True)
    text = filter(visible, data)
    return " ".join(t.strip() for t in text)

def sortonterm(val):
    # Sort key: (termid, positionid). Not used below; tuples sort naturally.
    return val[0], val[2]

def indexconstructor(direc_path):
    # Read the stop-word list (one word per line) from the current directory.
    fp = open('stoplist.txt', 'r')
    stoplist = list(fp)
    fp.close()
    for i in range(len(stoplist)):
        stoplist[i] = stoplist[i].rstrip('\n')

    # Read the term id mapping produced earlier, one "termid<TAB>term" per line.
    termdict = {}
    fp = open("termids.txt", 'r')
    termlist = list(fp)
    fp.close()
    for entry in termlist:
        entry = entry.rstrip('\n')
        pair = entry.split('\t')
        termdict[pair[1]] = int(pair[0])
    termlist = []

    # Move into the corpus directory; remember the original working directory
    # so the output can be written back there at the end.
    current_dir = os.getcwd()
    os.chdir(direc_path)
    doc_id = 1
    corpustuples = []  # one sorted run of (termid, docid, positionid) tuples per document
    # Main loop: one pass per corpus file.
    for fname in os.listdir():
        fp = open(fname, 'r', errors='ignore')
        content = fp.read()
        fp.close()
        # Skip anything stored before the HTML document itself (e.g. response headers).
        substr = "<!DOCTYPE"
        index = content.find(substr)
        htmlcode = content[index:]
        # Extract the visible text from the HTML.
        result = parsehtml(htmlcode)
        # Tokenize: all-caps runs, CamelCase pieces, or plain word characters
        # (apostrophes and hyphens allowed).
        token_list = nltk.regexp_tokenize(result, r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\'\w\-]+")
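        # Illustrative example of the pattern above: the string
        # "NASA IndexConstructor co-operate don't" tokenizes to
        # ["NASA", "Index", "Constructor", "co-operate", "don't"],
        # which the next step lower-cases.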
        for i in range(len(token_list)):
            token_list[i] = token_list[i].lower()
        # Drop tokens that appear in the stop list.
        token_list = [t for t in token_list if t not in stoplist]
        # Stem what is left.
        stemmer = PorterStemmer()
        for i in range(len(token_list)):
            token_list[i] = stemmer.stem(token_list[i])
        # Look up each token's term id and record a (termid, docid, positionid) tuple.
        tuplelist = []
        position_id = 1
        for i in range(len(token_list)):
            if token_list[i] in termdict:
                term_id = termdict.get(token_list[i])
                newtuple = (term_id, doc_id, position_id)
                tuplelist.append(newtuple)
            else:
                print(token_list[i] + " is not in dictionary")
            position_id += 1
        doc_id += 1
        # Sort this document's run locally before the global merge.
        tuplelist.sort()
        corpustuples.append(tuplelist)

    # Seed a min-heap with the first tuple of every non-empty per-document run.
    prioritylist = []
    i = 0
    while i < len(corpustuples):
        if len(corpustuples[i]) > 0:
            node = PriorityNode(i, 0, corpustuples[i][0])
            prioritylist.append(node)
        i += 1
    heapq.heapify(prioritylist)
    # K-way merge: repeatedly pop the smallest tuple and push its successor from
    # the same run, so final_list comes out sorted by (termid, docid, positionid).
    final_list = []
    while len(prioritylist) > 0:
        node = heapq.heappop(prioritylist)
        final_list.append(node.value)
        if node.arrayindex + 1 < len(corpustuples[node.arraynumber]):
            newnode = PriorityNode(node.arraynumber, node.arrayindex + 1,
                                   corpustuples[node.arraynumber][node.arrayindex + 1])
            heapq.heappush(prioritylist, newnode)
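    # Small worked example (illustrative values): merging the runs
    #   [(1, 1, 3), (4, 1, 7)] and [(2, 2, 1), (4, 2, 2)]
    # yields final_list = [(1, 1, 3), (2, 2, 1), (4, 1, 7), (4, 2, 2)].
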
    # Build the index from the sorted tuples, gap-encoding both document ids and
    # position ids. For each term, doc_list holds [doc gap, position gap, ...] for
    # one document; the same list object is appended to the term's doclist, so
    # position gaps added later land in the right place.
    term_index = []
    i = 0
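    # Gap-encoding example (illustrative): if term 7 occurs in document 3 at
    # positions 4, 9, 10 and in document 5 at position 2, its doclist becomes
    # [[3, 4, 5, 1], [2, 2]] -- document gaps 3 and 5-3=2, position gaps 4, 9-4=5,
    # 10-9=1 in the first document and 2 in the second -- and the line written to
    # term_index.txt below is "7 4 2 3,4,5,1 2,2".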
    while i < len(final_list):
        curterm = final_list[i][0]
        termnode = TermAppearance(curterm)
        curpos = curdoc = 0
        doc_list = []
        while i < len(final_list) and final_list[i][0] == curterm:
            if final_list[i][1] != curdoc:
                # New document for this term: start a fresh doc_list with the
                # document gap followed by the first position.
                doc_list = []
                termnode.docsappeared += 1
                doc_list.append(final_list[i][1] - curdoc)
                curdoc = final_list[i][1]
                curpos = 0
                doc_list.append(final_list[i][2] - curpos)
                curpos = final_list[i][2]
                termnode.doclist.append(doc_list)
            else:
                # Same document: append the gap to the previous position.
                doc_list.append(final_list[i][2] - curpos)
                curpos = final_list[i][2]
            termnode.totalappearance += 1
            i += 1
        term_index.append(termnode)

    # Write the index in the original working directory, one term per line:
    # "termid totalappearance docsappeared docgap,posgap,... docgap,posgap,..."
    os.chdir(current_dir)
    fp = open('term_index.txt', 'w')
    for i in range(len(term_index)):
        fp.write(str(term_index[i].termid) + " " + str(term_index[i].totalappearance) + " " + str(term_index[i].docsappeared))
        for j in range(len(term_index[i].doclist)):
            fp.write(" ")
            for k in range(len(term_index[i].doclist[j])):
                if k < len(term_index[i].doclist[j]) - 1:
                    fp.write(str(term_index[i].doclist[j][k]) + ",")
                else:
                    fp.write(str(term_index[i].doclist[j][k]))
        fp.write("\n")
    fp.close()
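
# Usage sketch (assumptions based on the code above, not stated elsewhere in the
# source): run as
#   python IndexConstructor_WithoutHash.py /path/to/corpus
# with stoplist.txt and termids.txt present in the current working directory;
# the gap-encoded index is written to term_index.txt alongside them.
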
if __name__ == "__main__":
    directory = sys.argv[1]
    indexconstructor(directory)