proxiesPool.py
import requests
from bs4 import BeautifulSoup
from itertools import cycle
from fake_useragent import UserAgent, FakeUserAgentError
import random
import asyncio
from logger import Logger
import time


class proxiesPool:
    def __init__(self):
        self.current_proxy = None
        self.current_headers = None
        self.proxy_pool = None
        self.headers_pool = None
        self.proxies = None
        self.headers = None
        self.logger = Logger().logger
        # Build an initial random header so self.headers is usable right away.
        self.dicti()

    def dicti(self):
        # Create a dict of accept headers for each user-agent.
        accepts = {"Firefox": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                   "Safari, Chrome": "application/xml,application/xhtml+xml,text/html;q=0.9, text/plain;q=0.8,image/png,*/*;q=0.5"}
        # Get a random user-agent. We used Chrome and Firefox user agents,
        # generated with the fake_useragent package.
        ua = UserAgent()
        if random.random() > 0.5:
            random_user_agent = ua.chrome
        else:
            random_user_agent = ua.firefox
        # Match the Accept header to the chosen user-agent.
        valid_accept = accepts['Firefox'] if random_user_agent.find('Firefox') > 0 else accepts['Safari, Chrome']
        self.headers = {"User-Agent": random_user_agent,
                        "Accept": valid_accept}

    def proxies_pool(self):
        url = 'https://www.sslproxies.org/'
        # Retrieve the site's page. The 'with' statement (a context manager) is used here
        # in order to automatically close the session when done.
        with requests.Session() as res:
            proxies_page = res.get(url)
            # Create a BeautifulSoup object and find the table element which holds all the proxies.
            soup = BeautifulSoup(proxies_page.content, 'html.parser')
            proxies_table = soup.find(id='proxylisttable')
            # Go through all rows in the proxies table and store them in the right format (IP:port) in our proxies list.
            proxies = []
            for row in proxies_table.tbody.find_all('tr'):
                proxies.append('{}:{}'.format(row.find_all('td')[0].string, row.find_all('td')[1].string))
        return proxies

    def random_header(self, logger):
        # Create a dict of accept headers for each user-agent.
        accepts = {"Firefox": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                   "Safari, Chrome": "application/xml,application/xhtml+xml,text/html;q=0.9, text/plain;q=0.8,image/png,*/*;q=0.5"}
        # Get a random user-agent. We used Chrome and Firefox user agents.
        # Take a look at the fake-useragent project's page to see all other options - https://pypi.org/project/fake-useragent/
        try:
            # Getting a user agent using the fake_useragent package
            ua = UserAgent()
            if random.random() > 0.5:
                random_user_agent = ua.chrome
            else:
                random_user_agent = ua.firefox
        except FakeUserAgentError as error:
            # In case there's a problem with the fake-useragent package we still want the scraper to
            # function, so we fall back to a pre-defined list of user agents and pick one at random.
            # Be aware that this list should be updated from time to time.
            # A list of user agents can be found here - https://developers.whatismybrowser.com/
            # The error is also saved into the logs file.
            print("[System]: FakeUserAgent didn't work. Generating headers from the pre-defined set of headers. error: {}".format(
                error))
            logger.error(
                "FakeUserAgent didn't work. Generating headers from the pre-defined set of headers. error: {}".format(
                    error))
            user_agents = [
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
                "Mozilla/5.0 (Windows NT 5.1; rv:7.0.1) Gecko/20100101 Firefox/7.0.1"]  # Used only if user agents can't be generated by fake-useragent
            random_user_agent = random.choice(user_agents)
        finally:
            # Create the headers dict. It's important to match the user-agent with the corresponding
            # Accept header from the accepts dict above.
            valid_accept = accepts['Firefox'] if random_user_agent.find('Firefox') > 0 else accepts['Safari, Chrome']
            headers = {"User-Agent": random_user_agent,
                       "Accept": valid_accept}
        return headers

    # Generate the pools
    def create_pools(self):
        self.proxies = self.proxies_pool()
        # List of headers, same length as the proxies list, so the two can be cycled in step.
        self.headers = [self.random_header(self.logger) for ind in range(len(self.proxies))]
        # Wrap both lists in itertools.cycle objects (iterators) that can be advanced with next(),
        # as done in getProxies() and changeProxies() below.
        self.proxy_pool = cycle(self.proxies)
        self.headers_pool = cycle(self.headers)
        return self.proxy_pool, self.headers_pool

    # Build the pools and return the first proxy/header pair
    def getProxies(self):
        self.create_pools()
        self.current_proxy = next(self.proxy_pool)
        self.current_headers = next(self.headers_pool)
        return self.current_proxy, self.current_headers

    # Advance both cycles and return the next proxy/header pair
    def changeProxies(self):
        self.current_proxy = next(self.proxy_pool)
        self.current_headers = next(self.headers_pool)
        return self.current_proxy, self.current_headers

    # def testProxies(self, link="https://www.n12.co.il/"):
    #     with requests.Session() as res:
    #         try:
    #             page = res.get(link, proxies={"http": self.current_proxy, "https": self.current_proxy},
    #                            headers=self.current_headers, timeout=30)
    #         except Exception as e:
    #             print("Error, testProxies():", e)
    #             time.sleep(5)
    #             self.changeProxies()
    #             return
    #
    #         if page.status_code != 200:
    #             print("Error, testProxies(): unable to connect via this proxy")
    #             time.sleep(10)
    #             self.changeProxies()


# Usage example:
# a = proxiesPool()
# current_proxy, current_headers = a.getProxies()
#
# # Introduce the proxy and headers in the GET request
# link = "https://www.n12.co.il/"
#
# with requests.Session() as req:
#     page = req.get(link, proxies={"http": current_proxy, "https": current_proxy},
#                    headers=current_headers, timeout=30)
#     print(page)
#
# # Get BeautifulSoup objects for all retrieved pages (assumes a 'pages' list of responses)
# soups = [BeautifulSoup(pages[ind].content, 'html.parser') if
#          pages[ind].status_code == 200 else "problem" for ind in range(len(pages))]
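

# A minimal runnable version of the commented-out usage above, kept under a __main__ guard.
# This block is a sketch rather than part of the original module: the target URL is the same
# example link used above, and the retry count and sleep interval are arbitrary choices.
# Free proxies from sslproxies.org are often dead, so expect several rotations before a
# request succeeds.
if __name__ == "__main__":
    pool = proxiesPool()
    current_proxy, current_headers = pool.getProxies()
    link = "https://www.n12.co.il/"

    # Try a handful of proxies, rotating with changeProxies() whenever a request fails.
    for attempt in range(5):
        try:
            with requests.Session() as req:
                page = req.get(link, proxies={"http": current_proxy, "https": current_proxy},
                               headers=current_headers, timeout=30)
            if page.status_code == 200:
                print("Fetched {} via {}".format(link, current_proxy))
                break
            print("Got status {} via {}, rotating proxy".format(page.status_code, current_proxy))
        except Exception as e:
            print("Request failed via {}: {}".format(current_proxy, e))
        time.sleep(5)
        current_proxy, current_headers = pool.changeProxies()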