This repository has been archived by the owner on Sep 23, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindexer.py
102 lines (91 loc) · 3.04 KB
/
indexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import pickle

# Load the corpus: a single text file holding every document, with documents
# separated by blank lines ("\n\n").
# you need to change to your own documents file name here
# NOTE(review): no encoding is given, so the platform default applies —
# pass encoding="utf-8" explicitly if the corpus is known to be UTF-8.
with open("all_articles.txt") as file:
    articles = file.read()
articles = articles.split("\n\n")
# The separator split leaves an empty element at each end of the list; drop both.
articles.pop(0)
articles.pop()
# forward_index: article position -> {word: occurrence count in that article}
forward_index = {}
# DocNo_dict: article position -> document number (first token of the article)
DocNo_dict = {}
# One pass over the corpus builds both mappings.
for i, article in enumerate(articles):
    tokens = article.split()
    # The first whitespace-separated token of each article is its document number.
    DocNo_dict[i] = tokens.pop(0)
    word_counts = {}
    for word in tokens:
        # dict.get with a default replaces the manual "seen before?" branch.
        word_counts[word] = word_counts.get(word, 0) + 1
    forward_index[i] = word_counts
# Persist the position -> doc_no mapping as a binary pickle file.
with open("doc_no.pkl", "wb") as f:
    pickle.dump(DocNo_dict, f)
print("Primary keys has been saved in doc_no.pkl.")
# inverse_index: word -> list of postings, each posting being
# [article position, occurrence count].
inverse_index = {}
for article_no, words_collection in forward_index.items():
    for word, counter in words_collection.items():
        # setdefault creates the posting list the first time a word is seen,
        # then appends to the existing list on every later occurrence.
        inverse_index.setdefault(word, []).append([article_no, counter])
print("Complete extract postings.")
def sort_key(sub_list):
    """Return the frequency field of a posting pair [article_no, frequency]."""
    article_no, frequency = sub_list
    return frequency
# Reorder every posting list so the most frequent articles come first.
for word in inverse_index:
    inverse_index[word] = sorted(inverse_index[word], key=sort_key, reverse=True)
# Flatten the index into [term, postings] pairs, terms in alphabetical order.
keys = sorted(inverse_index)
inverse_index_list = []
for key in keys:
    inverse_index_list.append([key, inverse_index[key]])
# Write every term's postings list into one binary file, recording for each
# term the byte offset where its pickle starts and the pickled size, so a
# posting list can later be fetched with a single seek + load.
# Opening the file once in 'wb' both truncates any previous file and avoids
# the original's reopen-in-append-mode on every loop iteration.
offsets = {}
with open('postings.pkl', 'wb') as f:
    for term, postings in inverse_index_list:
        offset_start = f.tell()
        pickle.dump(postings, f)
        # [start offset, size in bytes] of this term's pickled postings.
        offsets[term] = [offset_start, f.tell() - offset_start]
# prepare for dictionary: [term, [offset, size]] pairs in alphabetical order.
# (offsets was filled from the already-sorted inverse_index_list, but the
# explicit sort keeps the ordering guarantee independent of that.)
postings_index = offsets
keys = list(postings_index.keys())
keys.sort()
postings_index_list = []
for key in keys:
    postings_index_list.append([key, postings_index[key]])
# Build the dictionary file (term.pkl) and the root file (key_index.pkl).
# The dictionary is split into fixed-size blocks of terms; the root maps each
# block's first term to that block's [offset, size] inside term.pkl.
offsets = {}
# 500 terms per dictionary block.
group_size = 500
# Open once in 'wb' (truncates any previous file) instead of reopening in
# append mode on every iteration as the original did.
with open('term.pkl', 'wb') as f:
    for i in range(0, len(postings_index_list), group_size):
        offset_start = f.tell()
        # Slicing clamps at the end of the list, so the final, possibly
        # shorter, block needs no special-case branch.
        pickle.dump(postings_index_list[i:i + group_size], f)
        # Root entry: first term of the block -> [offset, size] in term.pkl.
        offsets[postings_index_list[i][0]] = [offset_start, f.tell() - offset_start]
with open('key_index.pkl', 'wb') as f:
    pickle.dump(offsets, f)
print("Completing dictionary and root files.")