collect_proxy.py
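A small crawler that harvests free HTTP proxies from www.66ip.cn, either through its plain-text API or by walking the per-area listing pages, and then validates each candidate concurrently by fetching baidu.com through it with a thread pool.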
# coding: utf-8
from urllib import request
from fake_useragent import FakeUserAgent
from bs4 import BeautifulSoup
import re
import time
from thread_pool import thread_pool  # local helper module, not a PyPI package
def check_proxy(ip):
    """Return the proxy dict if `ip` can fetch baidu.com within 5s, else None."""
    proxies = {'http': ip}
    proxy = request.ProxyHandler(proxies)
    opener = request.build_opener(proxy)
    ua = FakeUserAgent()
    url = 'http://www.baidu.com'
    headinfo = {'User-Agent': ua.random}
    reqhd = request.Request(url, headers=headinfo)
    try:
        req = opener.open(reqhd, timeout=5)
    except Exception as e:
        print('invalid ip:', proxies, e)
        return None
    if req.code == 200:
        return proxies
    return None
class GetProxy(object):
    def __init__(self, url=''):
        self.baseurl = url
        self.ua = FakeUserAgent()
        self.pools = []  # collected 'host:port' strings

    def getIps(self):
        return self.pools
    def getByApi(self, url):
        # The API endpoint returns a plain list of ip:port entries; keep every
        # whitespace-stripped string that starts with a digit.
        content = self.reqPage(url)
        if content:
            obj = BeautifulSoup(content, 'html5lib')
            listip = [item for item in obj.stripped_strings if re.match(r'\d', item)]
            self.pools.extend(listip)
    def getCharset(self, content):
        # Guess the page encoding from the <meta ... content-type ...> tag.
        # `content` is raw bytes, so the regexes run over its str() representation.
        scon = str(content)
        meta = re.search(r'<meta(.*?)content-type(.*?)>', scon, re.I)
        if meta:
            s = meta.group()
            m = re.search(r'charset=(.*?)[\"\' /]', s, re.I)
            if m:
                charset = m.groups()[0]
                return charset
        return 'utf-8'
    def reqPage(self, url):
        time.sleep(2)  # throttle so the source site does not block us
        headinfo = {'User-Agent': self.ua.random}
        reqhd = request.Request(url, headers=headinfo)
        try:
            req = request.urlopen(reqhd)
        except Exception as e:
            print('Error:', e)
            return None
        if req.code != 200:
            return None
        con = req.read()
        charset = self.getCharset(con)
        print(charset)
        try:
            con = con.decode(charset)
        except Exception as e:
            print('decode Error:', e)
            return None
        return con
    def parsePage(self, url):
        con = self.reqPage(url)
        if not con:
            return
        obj = BeautifulSoup(con, 'html5lib')
        div = obj.find('div', class_="containerbox boxindex")
        tbody = div.find('tbody')
        listtr = tbody.find_all('tr')
        for tr in listtr[1:]:  # skip the table header row
            tds = list(tr.stripped_strings)
            ip = ':'.join(tds[:2])  # first two cells are host and port
            print(ip)
            self.pools.append(ip)
    def parseArea(self, url):
        print(url)
        con = self.reqPage(url)
        if not con:
            return
        obj = BeautifulSoup(con, 'html5lib')
        listpage = obj.find('div', id="PageList")
        lista = listpage.find_all('a')
        for a in lista[:6]:  # only the first few result pages per area
            step = a.get('href')
            if step.endswith('/index'):
                step = step.replace('/index', '/1.html')
            self.parsePage(self.baseurl + step)
    def start(self):
        con = self.reqPage(self.baseurl)
        if not con:
            return
        obj = BeautifulSoup(con, 'html5lib')
        areas = obj.find('ul', class_="textlarge22")
        if areas:
            lista = areas.find_all('a')
            if lista:
                lista = lista[1:]  # the first link is not an area page
            for a in lista:
                step = a.get('href')
                if step:
                    self.parseArea(self.baseurl + step)
if __name__ == '__main__':
    apiurl = 'http://www.66ip.cn/mo.php?sxb=&tqsl=2000&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea='
    starturl = 'http://www.66ip.cn'
    proxyhd = GetProxy(url=starturl)
    tpools = thread_pool(50)
    #proxyhd.start()  # alternative: crawl the per-area listing pages instead
    proxyhd.getByApi(apiurl)
    ips = proxyhd.getIps()
    print(len(ips))
    for ip in ips:
        tpools.add_task(check_proxy, ip)
    stime = time.time()
    tpools.start()
    tpools.join()
    etime = time.time()
    rs = tpools.get_result()
    print('valid ips:', len(rs))
    for ip in rs:
        print(ip)
    print('elapsed:', etime - stime, 'seconds')
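
The script imports `thread_pool` from a local module that is not shown on this page. Judging only from how it is used above (a constructor taking a worker count, plus `add_task`, `start`, `join`, and `get_result`, where only non-`None` task return values count as results), a minimal compatible sketch could look like the following; the repo's real implementation may differ:

# thread_pool.py -- hypothetical sketch, not the repo's actual module
import threading
import queue

class thread_pool(object):
    def __init__(self, num_threads):
        self.num_threads = num_threads
        self.tasks = queue.Queue()
        self.results = []
        self.lock = threading.Lock()
        self.workers = []

    def add_task(self, func, *args, **kwargs):
        # Queue a callable; all tasks are added before start() is called.
        self.tasks.put((func, args, kwargs))

    def _worker(self):
        # Drain the queue; keep only non-None return values.
        while True:
            try:
                func, args, kwargs = self.tasks.get_nowait()
            except queue.Empty:
                return
            result = func(*args, **kwargs)
            if result is not None:
                with self.lock:
                    self.results.append(result)

    def start(self):
        for _ in range(self.num_threads):
            t = threading.Thread(target=self._worker)
            t.start()
            self.workers.append(t)

    def join(self):
        for t in self.workers:
            t.join()

    def get_result(self):
        return self.results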