forked from theanti9/PyCrawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
content_processor.py
152 lines (133 loc) · 3.9 KB
/
content_processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
from multiprocessing import Pool
import re, sys, logging, string
from ready_queue import ready_queue
# Module-level logger, looked up under the fixed name "crawler_logger".
logger = logging.getLogger("crawler_logger")
def rankKeywords(text):
    """Count how often each keyword occurs in *text*.

    The text is split on single spaces. Every character listed in
    ``string.punctuation`` is removed from each token, surrounding whitespace
    is trimmed, and tokens that end up empty or that appear in the stop-word
    list are skipped. Matching is case-sensitive.

    Returns a dict mapping keyword -> number of occurrences.
    """
    # Sets give O(1) membership tests inside the loop.
    invalid_keywords = frozenset(['', ' ', "i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"])
    exclude = frozenset(string.punctuation)
    ranks = {}
    for token in text.split(' '):
        # Drop punctuation wherever it sits in the token, then trim whitespace.
        word = ''.join(ch for ch in token if ch not in exclude).strip()
        if word in invalid_keywords:
            continue
        # dict.get instead of dict.has_key, which no longer exists on Python 3.
        ranks[word] = ranks.get(word, 0) + 1
    return ranks
def stripPunctuation(text):
    """Return *text* with each character that is neither a word character
    nor whitespace replaced by a single space."""
    return re.sub(r'[^\w\s]', ' ', text)
def stripScript(text):
    """Replace every ``<script ...>...</script>`` element in *text* with a space.

    Matching is case-insensitive and spans line breaks, so upper-case tags
    and script blocks that cover several lines are removed as well.
    """
    # DOTALL lets ".*?" cross newlines; IGNORECASE also catches <SCRIPT>.
    pattern = re.compile(r'<script.*?\/script>', re.DOTALL | re.IGNORECASE)
    return pattern.sub(' ', text)
class ContentProcessor:
    """Pulls the title, head, body and keyword counts out of one HTML page.

    Usage: supply the page's url, status and raw text (constructor or
    setInfo), call process(), then read the result with getDataDict().
    """

    # Number of space-separated words handed to one worker at a time.
    WORDS_PER_CHUNK = 500
    # Body texts longer than this many characters are ranked in parallel.
    PARALLEL_THRESHOLD = 5000

    def __init__(self, url, status, text):
        """Store the page data; *text* may be None until setText/setInfo is called."""
        self.keyword_dicts = []
        self.invalid_keywords = ['', ' ', "i", "a", "an", "and", "the", "for", "be", "to", "or", "too", "also"]
        self.keywords = {}
        self.text = text
        # Keep size consistent with setText/setInfo instead of always 0.
        self.size = len(text) if text is not None else 0
        self.url = url
        self.status = status
        # Defined up front so getDataDict() works before process() has run.
        self.head = None
        self.body = None
        self.title = None

    def setText(self, text):
        """Replace the page text and update size to its length (0 for None)."""
        self.text = text
        self.size = len(text) if text is not None else 0

    def setUrl(self, url):
        """Replace the page url."""
        self.url = url

    def setStatus(self, status):
        """Replace the page status."""
        self.status = status

    def setInfo(self, url, status, text):
        """Replace url, status and text in one call; size follows the text."""
        self.url = url
        self.status = status
        self.text = text
        self.size = len(text) if text is not None else 0

    def reset(self):
        """Clear all per-page state except the url."""
        self.keyword_dicts = []
        self.keywords = {}
        self.text = None
        self.head = None
        self.body = None
        self.title = None
        self.size = 0
        self.status = None

    def combineKeywordLists(self):
        """Merge the count dicts in keyword_dicts into self.keywords.

        With exactly one dict, self.keywords is rebound to that dict (no
        copy). Otherwise every count is added onto whatever self.keywords
        already holds.
        """
        if len(self.keyword_dicts) == 1:
            self.keywords = self.keyword_dicts[0]
            return
        for counts in self.keyword_dicts:
            for word, count in counts.items():
                # dict.get instead of dict.has_key, which Python 3 removed.
                self.keywords[word] = self.keywords.get(word, 0) + count

    def _split_into_chunks(self, text, words_per_chunk):
        """Split *text* on single spaces into chunks of at most *words_per_chunk* words.

        Joining the returned chunks with a single space reproduces *text*,
        so no words are dropped, including those in the final short chunk.
        """
        words = text.split(' ')
        return [
            ' '.join(words[start:start + words_per_chunk])
            for start in range(0, len(words), words_per_chunk)
        ]

    def _rank_chunks_in_parallel(self, chunks):
        """Run rankKeywords over *chunks* in a process pool; return one dict per chunk."""
        import multiprocessing
        try:
            cpu_total = multiprocessing.cpu_count()
        except NotImplementedError:
            cpu_total = 1
        # One process per chunk is unbounded for large pages; cap at the CPU count.
        workers = max(1, min(len(chunks), cpu_total))
        logger.debug("processing with %i threads" % workers)
        # Created outside the try so the handlers below always have a pool to clean up.
        pool = Pool(processes=workers)
        try:
            results = pool.map(rankKeywords, chunks)
        except KeyboardInterrupt:
            pool.terminate()
            pool.join()
            sys.exit()
        except Exception:
            # Do not leak worker processes when a chunk fails.
            pool.terminate()
            pool.join()
            raise
        else:
            pool.close()
            pool.join()
        logger.debug("processed, returned %i dicts" % len(results))
        return results

    # returns links to queue
    def processBody(self):
        """Rank the keywords of self.body and return ready_queue(url, body).

        self.text is overwritten with the body after scripts, tags and
        punctuation have been removed. Texts longer than PARALLEL_THRESHOLD
        characters are split into WORDS_PER_CHUNK-word chunks and ranked in a
        process pool (replacing keyword_dicts); shorter texts are ranked
        inline and the result is appended to keyword_dicts.
        """
        queue = ready_queue(self.url, self.body)
        self.text = stripPunctuation(self.remove_html_tags(stripScript(self.body)))
        if len(self.text) > self.PARALLEL_THRESHOLD:
            chunks = self._split_into_chunks(self.text, self.WORDS_PER_CHUNK)
            self.keyword_dicts = self._rank_chunks_in_parallel(chunks)
        else:
            self.keyword_dicts.append(rankKeywords(self.text))
        return queue

    def processHead(self):
        """Placeholder: the head section is not analysed."""
        pass

    def remove_html_tags(self, data):
        """Return *data* with every ``<...>`` span removed.

        DOTALL is set so that tags whose attributes wrap onto several lines
        are removed too.
        """
        p = re.compile(r'<.*?>', re.DOTALL)
        return p.sub('', data)

    def findnth(self, haystack, needle, n):
        """Return the index of the *n*-th (1-based) occurrence of *needle*.

        Returns -1 when *haystack* holds fewer than *n* occurrences or when
        *n* is smaller than 1.
        """
        if n < 1:
            return -1
        parts = haystack.split(needle, n)
        if len(parts) <= n:
            return -1
        return len(haystack) - len(parts[-1]) - len(needle)

    def _extract_element(self, tag, keep_open_tag=False, allow_unclosed=False):
        """Return the content of the first ``<tag ...>`` element in self.text.

        tag            -- element name, matched case-insensitively as a whole
                          word (so 'head' does not match '<header>').
        keep_open_tag  -- include the opening tag itself in the result.
        allow_unclosed -- when the closing tag is missing, return everything
                          up to the end of the text instead of ''.

        Returns '' when the opening tag is absent.
        """
        text = self.text
        name = re.escape(tag)
        opening = re.search(r'<%s\b[^>]*>' % name, text, re.IGNORECASE)
        if opening is None:
            return ''
        start = opening.start() if keep_open_tag else opening.end()
        closing = re.compile(r'</%s\s*>' % name, re.IGNORECASE).search(text, opening.end())
        if closing is None:
            return text[start:] if allow_unclosed else ''
        return text[start:closing.start()]

    # returns the queue from processBody
    def process(self):
        """Split self.text into title, head and body, then rank the body.

        The title and head hold the text between their opening and closing
        tags ('' when the element is missing). The body keeps its opening
        tag and runs to ``</body>``, or to the end of the text when that tag
        is missing. Returns whatever processBody() returns.
        """
        self.title = self._extract_element('title')
        self.head = self._extract_element('head')
        self.processHead()
        self.body = self._extract_element('body', keep_open_tag=True, allow_unclosed=True)
        queue = self.processBody()
        self.combineKeywordLists()
        return queue

    def getDataDict(self):
        """Drop keywords seen fewer than 3 times and return the page summary.

        The returned dict has the keys "address", "title", "status", "size"
        and "keywords". self.keywords is pruned in place.
        """
        # Collect the keys first: deleting while iterating the live dict
        # raises RuntimeError on Python 3.
        rare = [word for word, count in self.keywords.items() if count < 3]
        for word in rare:
            del self.keywords[word]
        return {"address":self.url, "title":self.title, "status":self.status, "size":self.size, "keywords":self.keywords}