# Program to create tokenized terms from a given corpus:
# parse the HTML of each document, tokenize the visible text,
# and write term/document ID mappings to file.
import os
import sys
from bs4 import BeautifulSoup
from bs4.element import Comment
import nltk
from nltk import PorterStemmer
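
# Requires the beautifulsoup4 and nltk packages
# (pip install beautifulsoup4 nltk); regexp_tokenize is
# purely regex-based, so no extra NLTK data downloads are needed.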


def visible(element):
    # Return True only for text nodes that a reader would actually see
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    elif isinstance(element, Comment):  # filter out HTML comments
        return False
    return True
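
# For example, in "<script>var x;</script><p>hi</p>" the string "var x;"
# has parent <script> and is rejected, while "hi" (parent <p>) is kept.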


def parsehtml(rawcontent):
    # Parse HTML and keep only the visible text
    soup = BeautifulSoup(rawcontent, 'html.parser')
    data = soup.find_all(string=True)  # every text node in the document
    text = filter(visible, data)
    return u" ".join(t.strip() for t in text)
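
# A quick illustration (hypothetical snippet):
#   parsehtml("<html><head><title>T</title></head><body><p>Hello</p></body></html>")
# returns "Hello" -- the title text is dropped because visible() filters it out.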


def tokenizer(direc_path):
    # Read the stop-list, switch to the corpus directory,
    # then tokenize every file found there.
    with open('stoplist.txt', 'r') as fp:
        # strip the trailing newline from each stop word
        stoplist = [line.rstrip('\n') for line in fp]
    stopset = set(stoplist)
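    # stoplist.txt is assumed to hold one (lower-case) stop word
    # per line, e.g. "the", "and", "of".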
    # remember where we started, then move into the corpus directory
    current_dir = os.getcwd()
    os.chdir(direc_path)
    term_dictionary = {}
    doc_dictionary = {}
    doc_id = 1
    term_id = 1
    # main loop: one document per file
    for fname in os.listdir():
        # read the file and record its name under a fresh document ID
        with open(fname, 'r', errors='ignore') as fp:
            content = fp.read()
        doc_dictionary[doc_id] = fname
        doc_id += 1
        # skip anything stored before the HTML itself (e.g. response
        # headers); if no doctype is present, fall back to the whole file
        index = content.find("<!DOCTYPE")
        htmlcode = content[index:] if index != -1 else content
        # get the visible text
        result = parsehtml(htmlcode)
        # Tokenize, then lower-case every token
        token_list = nltk.regexp_tokenize(
            result, r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|['\w\-]+")
        token_list = [t.lower() for t in token_list]
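        # The pattern keeps all-caps acronyms ("USA"), the capitalized
        # pieces of CamelCase runs ("Camel" out of "CamelCase"), and
        # ordinary words with apostrophes or hyphens ("don't",
        # "state-of-the-art").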
        # drop tokens that appear in the stop list
        token_list = [t for t in token_list if t not in stopset]
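        # Set membership is O(1), so this pass is linear in the number
        # of tokens instead of rescanning the stop list for every token.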
        # stem every token
        stemmer = PorterStemmer()
        token_list = [stemmer.stem(t) for t in token_list]
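        # e.g. PorterStemmer().stem('running') -> 'run',
        #      PorterStemmer().stem('cats')    -> 'cat'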
        # assign each previously unseen term the next term id
        for token in token_list:
            if token not in term_dictionary:
                term_dictionary[token] = term_id
                term_id += 1
    # write the doc dictionary to file, format: doc id \t document name
    os.chdir(current_dir)
    with open('docids.txt', 'w') as f:
        for did, name in doc_dictionary.items():
            f.write(str(did) + '\t' + name + '\n')
    # write the term dictionary to file, format: term id \t term
    with open('termids.txt', 'w', errors='ignore') as f:
        for term, tid in term_dictionary.items():
            f.write(str(tid) + '\t' + term + '\n')
    return (term_dictionary, doc_dictionary)
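
# Illustrative output lines (hypothetical corpus):
#   docids.txt:  "1\tsome_page.html"
#   termids.txt: "42\tcomput"

# Usage: python tokenizer.py <corpus_directory>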
if __name__ == '__main__':
    directory = sys.argv[1]
    tokenizer(directory)