# -*- coding=utf-8 -*-
__author__ = 'Rocky'
import urllib2
import time
import datetime
import sqlite3
from lxml import etree
class getProxy():

    def __init__(self):
        self.user_agent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
        self.header = {"User-Agent": self.user_agent}
        self.dbname = "proxy.db"
        self.now = time.strftime("%Y-%m-%d")
    def getContent(self, num):
        # /nn/ is the "domestic elite (high-anonymity)" proxy list.
        nn_url = "http://www.xicidaili.com/nn/" + str(num)
        req = urllib2.Request(nn_url, headers=self.header)
        resp = urllib2.urlopen(req, timeout=10)
        content = resp.read()
        et = etree.HTML(content)
        result_even = et.xpath('//tr[@class=""]')
        result_odd = et.xpath('//tr[@class="odd"]')
        # The page alternates row classes between "" and "odd", so the most
        # convenient lxml approach is to fetch the two sets separately. A
        # single query kept returning mismatched rows at first, probably
        # because the site tweaks its markup from time to time to shake off
        # other scrapers. Querying by class survives those changes and still
        # yields the IP and port; a combined sketch follows below.
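        # For reference, a combined query (an alternative sketch, not the
        # original approach) can grab both row classes in one call:
        #   rows = et.xpath('//tr[@class="odd" or @class=""]')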
        for i in result_even:
            t1 = i.xpath("./td/text()")[:2]
            print "IP:%s\tPort:%s" % (t1[0], t1[1])
            if self.isAlive(t1[0], t1[1]):
                self.insert_db(self.now, t1[0], t1[1])
        for i in result_odd:
            t2 = i.xpath("./td/text()")[:2]
            print "IP:%s\tPort:%s" % (t2[0], t2[1])
            if self.isAlive(t2[0], t2[1]):
                self.insert_db(self.now, t2[0], t2[1])
    def insert_db(self, date, ip, port):
        try:
            conn = sqlite3.connect(self.dbname)
        except sqlite3.Error:
            print "Error opening database %s" % self.dbname
            return
        create_tb = '''
        CREATE TABLE IF NOT EXISTS PROXY
        (DATE TEXT,
        IP TEXT,
        PORT TEXT
        );
        '''
        conn.execute(create_tb)
        # Parameterized query: the original string formatting broke on values
        # containing quotes and was open to SQL injection.
        conn.execute("INSERT INTO PROXY (DATE,IP,PORT) VALUES (?,?,?);",
                     (date, ip, port))
        conn.commit()
        conn.close()
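    # Note (an observation, not in the original code): the schema has no
    # uniqueness constraint, so repeated runs accumulate duplicate rows. One
    # possible fix would be UNIQUE(IP, PORT) in the CREATE TABLE together
    # with INSERT OR IGNORE above.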
    def loop(self, page=5):
        # range(1, page) would stop one page short; page + 1 makes loop(5)
        # actually fetch five pages.
        for i in range(1, page + 1):
            self.getContent(i)
    # Check whether a scraped proxy IP still works.
    def isAlive(self, ip, port):
        proxy = {'http': ip + ':' + port}
        print proxy
        # install_opener makes this proxy the process-wide default opener.
        proxy_support = urllib2.ProxyHandler(proxy)
        opener = urllib2.build_opener(proxy_support)
        urllib2.install_opener(opener)
        # Fetch the Tencent homepage through the proxy to verify it works.
        test_url = "http://www.qq.com"
        req = urllib2.Request(test_url, headers=self.header)
        try:
            # timeout is 10 seconds; lower it if a proxy that slow is not
            # acceptable to you.
            resp = urllib2.urlopen(req, timeout=10)
            if resp.code == 200:
                print "work"
                return True
            else:
                print "not work"
                return False
        except Exception:
            print "not work"
            return False
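    # A per-request alternative (hypothetical helper, not part of the original
    # code): opener.open scopes the proxy to a single call instead of
    # replacing urllib2's global opener the way isAlive does.
    def isAliveLocal(self, ip, port):
        opener = urllib2.build_opener(urllib2.ProxyHandler({'http': ip + ':' + port}))
        req = urllib2.Request("http://www.qq.com", headers=self.header)
        try:
            return opener.open(req, timeout=10).code == 200
        except Exception:
            return False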
    # Re-check the proxies stored in the database and delete dead records.
    def check_db_pool(self):
        conn = sqlite3.connect(self.dbname)
        query_cmd = '''
        SELECT IP, PORT FROM PROXY;
        '''
        # fetchall() first: deleting rows while iterating a live cursor over
        # the same table can skip entries.
        rows = conn.execute(query_cmd).fetchall()
        for row in rows:
            if not self.isAlive(row[0], row[1]):
                # The proxy is dead; remove it from the database.
                print "delete IP %s in db" % row[0]
                conn.execute("DELETE FROM PROXY WHERE IP=?;", (row[0],))
        conn.commit()
        conn.close()
if __name__ == "__main__":
    now = datetime.datetime.now()
    print "Start at %s" % now
    obj = getProxy()
    obj.loop(5)
    obj.check_db_pool()