Spider.py
#!/usr/bin/env python2
import logging
logging.basicConfig(level=logging.DEBUG,
                    format='%(name)s - %(message)s')

import requests
from bs4 import BeautifulSoup
import pdfkit
import time

from ParseOut import ParseOutYear, ParseOutTitle, ParseOutContent, ParseOutTag, ParseOutURL


class Spider:
    def __init__(self, url,
                 p_key=[],
                 n_key=[],
                 score_level=0,
                 key_score={'p': 1, 'n': -3, 'p_none': 1, 'n_none': -1, 'none': -5},
                 weighting={'title': 1.5, 'content': 1},
                 page=5,
                 parser='html.parser',
                 googleScholarURL="http://scholar.google.com.tw"):
        self.url = url
        self.p_key = p_key
        self.n_key = n_key
        self.score_level = score_level
        self.key_score = key_score
        self.weighting = weighting
        self.page = page
        self.parser = parser
        self.__googleScholarURL = googleScholarURL

    def crawl(self):
        logger = logging.getLogger('crawl')
        page_urls = []
        page_urls.append(self.url)
        page_urls += self.__findPages()
        results = []
        for index, page_url in enumerate(page_urls):
            res = requests.get(page_url)
            soup = BeautifulSoup(res.text, self.parser)
print "You are now in page ", (index + 1), " !!!"
### Test if the crawler is blocked by the Google robot check
page_links = soup.select('div[id="gs_nml"] a')
if not page_links:
logger.info('1.Google robot check might ban you from crawling!!')
logger.info('2.You might not crawl the page of google scholar')
### Try to crawl the page no matter it might be banned by Google robot check
results += self.__crawlPage(soup, index + 1)
time.sleep(4)
return results
    def __findPages(self):
        logger = logging.getLogger('__findPages')
        res = requests.get(self.url)
        soup = BeautifulSoup(res.text, self.parser)
        page_url = []
        page_links = soup.select('div[id="gs_nml"] a')
        if not page_links:
            logger.debug('Cannot find the page links in the start URL!!')
            logger.info('1. The Google robot check might have banned you from crawling!!')
            logger.info('2. You might not be crawling a Google Scholar results page')
        else:
            counter = 0
            for page_link in page_links:
                counter += 1
                if (counter >= self.page):
                    break
                page_url.append(self.__googleScholarURL + page_link['href'])
        return page_url
    def __crawlPage(self, soup, page_index):
        logger = logging.getLogger('__crawlPage')
        counter = 0
        results = []
        blocks = soup.select('div[class="gs_r gs_or gs_scl"]')
        for block in blocks:
            counter += 1
            result = {}
            try:
                b_title = block.select('h3 a')[0].text  # Title
                result['b_title'] = b_title
            except:
                ### If there is no title in this block, ignore this block
                logger.debug("No Title in Page %s Block %s", page_index, counter)
                continue
            try:
                b_content = block.select('div[class="gs_rs"]')[0].text  # Content
                result['content'] = b_content
            except:
                logger.debug("No Content in Page %s Block %s", page_index, counter)
                result['content'] = None
            try:
                b_url = block.select('h3 a')[0]['href']  # URL
                b_url = ParseOutURL(b_url)
                result['url'] = b_url
            except:
                ### If there is no URL in this block, ignore this block
                logger.debug("No URL in Page %s Block %s", page_index, counter)
                continue
            try:
                b_year = block.select('div[class="gs_a"]')[0].text  # Year
                b_year = ParseOutYear(b_year)
                result['year'] = b_year
            except:
                logger.debug("No Year in Page %s Block %s", page_index, counter)
                result['year'] = None
            ### Check for keywords in the title and content and
            ### score each of them with the configured keyword scores
            f_title, t_score = ParseOutTitle(result['b_title'], self.p_key, self.n_key, self.key_score)
            result['f_title'] = f_title
            content, c_score = ParseOutContent(result['content'], self.p_key, self.n_key, self.key_score)
            result['require'], result['score'] = self.__requireThesis(t_score, c_score)
            ### Record the tag of a required thesis.
            ### Initialize result['tag'] and result['tag_link'] to None.
            ### If the thesis is required and its tag is [PDF] or [HTML],
            ### set result['tag'] to 'PDF' or 'HTML' and record the
            ### link in result['tag_link']
            result['tag'] = None
            result['tag_link'] = None
            if result['require']:
                tag = block.select('div[class="gs_ggsd"] a')
                if tag:
                    tag_link = tag[0]['href']
                    tag_text = tag[0].text
                    tag_text = ParseOutTag(tag_text)
                    result['tag'] = tag_text
                    result['tag_link'] = tag_link
            ### Set result['download'] to False,
            ### because the thesis hasn't been downloaded yet
            result['download'] = False
            ### test only the first link in each page
            # break
            ### Append the information of the block
            ### ('title', 'year', 'content', 'require', 'download')
            ### to results
            results.append(result)
        return results
    def __requireThesis(self, t_score, c_score):
        score = self.weighting['title'] * t_score + self.weighting['content'] * c_score
        if (score > self.score_level):
            return (True, score)
        else:
            return (False, score)
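
    ### Scoring sketch (illustrative numbers, not taken from the project):
    ### with the default weighting {'title': 1.5, 'content': 1} and
    ### score_level=0, a title score of 2 and a content score of -1 give
    ### score = 1.5 * 2 + 1 * (-1) = 2 > 0, so the thesis is required,
    ### while a title score of 0 and a content score of -3 give
    ### score = 1.5 * 0 + 1 * (-3) = -3 <= 0, so it is skipped.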

    # def __getPDF(self, url, title, year):
    #     res = requests.get(url)
    #     print "in __getPDF"
    #     f_name = year + " - " + title.strip() + '.pdf'
    #     with open(f_name, 'wb') as f:
    #         print "Downloading PDF... " + title
    #         f.write(res.content)

    # def __getHTML2PDF(self, url, title, year):
    #     options = {'page-size': 'A4', 'dpi': 400}
    #     f_name = year + " - " + title.strip() + '.pdf'
    #     pdfkit.from_url(url, f_name, options = options)
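

### Example usage: a minimal sketch, not part of the original project.
### The query URL, keyword lists, and score threshold below are
### illustrative assumptions; running this needs network access to
### Google Scholar and the ParseOut module next to this file.
if __name__ == '__main__':
    spider = Spider("http://scholar.google.com.tw/scholar?q=wireless+sensor+network",
                    p_key=['sensor', 'localization'],
                    n_key=['survey'],
                    score_level=0,
                    page=3)
    for result in spider.crawl():
        print result['b_title'], result['score'], result['require']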