middlewares.py
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
import random
import time


'''
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from scrapy.http.response.html import HtmlResponse


class SelenuimDownloadMiddleware(object):
    def __init__(self):
        chrome_options = Options()
        # chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        # Disable image loading so pages render faster.
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option('prefs', prefs)
        self.driver = webdriver.Chrome(executable_path=r"chromedriver", chrome_options=chrome_options)

    def process_request(self, request, spider):
        print(">>>>>>>> " + request.url)
        self.driver.get(request.url)
        time.sleep(1)
        try:
            # Keep clicking the "next page" ('下一页') element until it disappears;
            # the loop ends when find_element raises an exception.
            while True:
                next_page = self.driver.find_element_by_class_name('下一页')
                next_page.click()
                time.sleep(1)
        except Exception:
            pass
        source = self.driver.page_source
        response = HtmlResponse(url=self.driver.current_url, body=source, request=request)
        return response
'''
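
# Example (not from this project): if the Selenium middleware above were uncommented, it would be
# enabled in settings.py like any other downloader middleware. The module path and priority below
# are assumptions for illustration only.
#
# DOWNLOADER_MIDDLEWARES = {
#     'scrapy_spider.middlewares.SelenuimDownloadMiddleware': 543,
# }
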

class ScrapySpiderSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
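
# Example (illustrative sketch, not from this repository): registering the spider middleware above
# in the project's settings.py. The package name "scrapy_spider" and the priority value are assumptions.
#
# SPIDER_MIDDLEWARES = {
#     'scrapy_spider.middlewares.ScrapySpiderSpiderMiddleware': 543,
# }
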

class ScrapySpiderDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def __init__(self, agents):
        self.agents = agents

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        # s = cls()
        # crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        # return s
        return cls(crawler.settings.getlist('USER_AGENTS'))

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        request.headers.setdefault('User-Agent', random.choice(self.agents))
        time.sleep(random.random() * 50)  # random delay (up to ~50 s) to avoid 403 responses
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
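
# Example (illustrative sketch, not from this repository): the downloader middleware above reads
# its user-agent pool via crawler.settings.getlist('USER_AGENTS'), so settings.py needs entries
# roughly like the ones below. The module path, priority, and user-agent strings are assumptions
# for illustration only.
#
# DOWNLOADER_MIDDLEWARES = {
#     'scrapy_spider.middlewares.ScrapySpiderDownloaderMiddleware': 543,
# }
# USER_AGENTS = [
#     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0 Safari/537.36',
#     'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
# ]
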
# from scrapy.http import HtmlResponse
# from selenium import webdriver
# from selenium.common.exceptions import TimeoutException
# from gp.configs import *
#
#
# class ChromeDownloaderMiddleware(object):
#
#     def __init__(self):
#         options = webdriver.ChromeOptions()
#         options.add_argument('--headless')  # run Chrome without a visible window
#         if CHROME_PATH:
#             options.binary_location = CHROME_PATH
#         if CHROME_DRIVER_PATH:
#             self.driver = webdriver.Chrome(chrome_options=options, executable_path=CHROME_DRIVER_PATH)  # initialize the Chrome driver
#         else:
#             self.driver = webdriver.Chrome(chrome_options=options)  # initialize the Chrome driver
#
#     def __del__(self):
#         self.driver.close()
#
#     def process_request(self, request, spider):
#         try:
#             print('Chrome driver begin...')
#             self.driver.get(request.url)  # load the requested URL in the browser
#             return HtmlResponse(url=request.url, body=self.driver.page_source, request=request, encoding='utf-8',
#                                 status=200)  # return the rendered HTML
#         except TimeoutException:
#             return HtmlResponse(url=request.url, request=request, encoding='utf-8', status=500)
#         finally:
#             print('Chrome driver end...')
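
# Example (assumption, for illustration only): the commented ChromeDownloaderMiddleware above
# imports CHROME_PATH and CHROME_DRIVER_PATH from a gp.configs module that is not part of this
# file; a minimal version of that module might look like this.
#
# CHROME_PATH = ''         # path to a Chrome/Chromium binary; empty to use the default install
# CHROME_DRIVER_PATH = ''  # path to chromedriver; empty to rely on the system PATH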