# Import general libraries
import datetime
import os
import random
import time

import requests
from bs4 import BeautifulSoup as soup

# Import Selenium packages
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import WebDriverException
from selenium.common.exceptions import ElementNotVisibleException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys

# Suppress warnings about unverified HTTPS requests (verification is configurable below)
requests.packages.urllib3.disable_warnings()
def request_page(url_string, verification, robust):
    """HTTP GET request to a URL.
    Args:
        url_string (str): The URL to request.
        verification (bool): Whether the TLS certificate should be verified.
        robust (bool): Whether to run in robust mode and recover from blocking.
    Returns:
        HTML source code of the page.
    """
    if robust:
        # Scrape contents in recovery mode: retry with a growing back-off on failure
        loop = False
        first = True
        c = 0
        while loop or first:
            first = False
            try:
                uclient = requests.get(url_string, timeout=60, verify=verification)
                page_html = uclient.text
                return page_html
            except requests.exceptions.ConnectionError:
                c += 10
                print("Request blocked, .. waiting and continuing...")
                time.sleep(random.randint(10, 60) + c)
                loop = True
                continue
            except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout):
                print("Request timed out, .. waiting one minute and continuing...")
                time.sleep(60)
                loop = True
                continue
    else:
        uclient = requests.get(url_string, timeout=60, verify=verification)
        page_html = uclient.text
        return page_html
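# Usage sketch (illustrative, not part of the scraping flow): fetch the portal page
# used in main() over plain HTTP and parse it; the flag values here are assumptions.
#   html = request_page('http://www.puna.gov.al/VLPDisplay.aspx', verification=False, robust=True)
#   page_soup = soup(html, 'html.parser')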
def request_page_fromselenium(url_string, driver, robust):
    """Request HTML source code via the Selenium web driver to circumvent
    blocking mechanisms that are triggered by plain HTTP requests.
    Args:
        url_string (str): The URL to request.
        driver: Selenium web driver.
        robust (bool): Whether to run in robust mode and recover from blocking.
    Returns:
        HTML source code of the page.
    """
    if robust:
        # Scrape contents in recovery mode: retry with a growing back-off on failure
        loop = False
        first = True
        c = 0
        while loop or first:
            first = False
            try:
                open_webpage(driver, url_string)
                time.sleep(5)
                page_html = driver.page_source
                return page_html
            except WebDriverException:
                c += 10
                print("Web Driver problem, .. waiting and continuing...")
                time.sleep(random.randint(10, 60) + c)
                loop = True
                continue
    else:
        open_webpage(driver, url_string)
        time.sleep(5)
        page_html = driver.page_source
        return page_html
def set_driver(webdriverpath, headless):
    """Set up a Chrome web driver.
    Args:
        webdriverpath (str): Path to the chromedriver executable.
        headless (bool): Whether to run Chrome in headless mode.
    Returns:
        Configured Chrome web driver instance.
    """
    options = Options()
    if headless:
        options.add_argument("--headless")
    return webdriver.Chrome(webdriverpath, chrome_options=options)
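# Usage sketch: how the driver helpers fit together. The chromedriver path below is a
# placeholder (main() uses its own local path); the URL is the portal address from main().
#   drv = set_driver("C:\\path\\to\\chromedriver.exe", headless=True)
#   html = request_page_fromselenium('http://www.puna.gov.al/VLPDisplay.aspx', drv, robust=True)
#   drv.quit()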
def create_object_soup(object_link, verification, robust):
    """Create page soup out of an object link for a product.
    Args:
        object_link (str): Link to the object page.
        verification (bool): Certificate verification parameter.
        robust (bool): Robustness parameter.
    Returns:
        Tuple of BeautifulSoup object and object_link.
    """
    object_soup = soup(request_page(object_link, verification, robust), 'html.parser')
    return (object_soup, object_link)
def make_soup(link, verification, robust=False):
    """Create soup of a listing-specific webpage.
    Args:
        link (str): Link to the listing page.
        verification (bool): Certificate verification parameter.
        robust (bool): Robustness parameter passed on to request_page.
    Returns:
        Soup element containing listing-specific information.
    """
    return soup(request_page(link, verification, robust), 'html.parser')
def reveal_all_items(driver):
    """Reveal all items on the category web page of Albert Heijn by clicking "continue".
    Args:
        driver: Selenium web driver.
    Returns:
        True once all items have been revealed.
    """
    hidden = True
    while hidden:
        try:
            time.sleep(random.randint(5, 7))
            driver.find_element_by_css_selector('section#listing-home div.col-md-6.customlistinghome > a').click()
        except (NoSuchElementException, ElementNotVisibleException):
            hidden = False
    return True
def open_webpage(driver, url):
    """Open a web page in the given driver.
    Args:
        driver: Selenium web driver.
        url (str): URL of the page to open.
    Returns:
        None; the page is opened and the window maximized.
    """
    driver.set_page_load_timeout(60)
    driver.get(url)
    driver.maximize_window()
def find_correct_css_element(pagination_container):
    """Find the current position in the pagination container
    and return the right nth-child number to click on.
    Args:
        pagination_container: List of BS4 <td> cells of the pager row.
    Returns:
        Integer nth-child index of the next page link to click.
    """
    # Drop the first and last cells of the pager row
    container = pagination_container[1:len(pagination_container) - 1]
    for i in container:
        try:
            # Cells linking to other pages carry an <a href>; skip them
            i.a['href']
            continue
        except TypeError:
            # The current page cell carries no link; return the position of the
            # cell right after it, i.e. the next page to click
            pos = container.index(i)
            return pos + 2
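# Worked illustration (hypothetical pager row, hedged reading of the markup): findAll('td')
# on the pager <tr> yields [wrapper td, "1" (link), "2" (current, no link), "3" (link), "..."].
# Trimming the first and last cells leaves ["1", "2", "3"]; the current page sits at index 1,
# so find_correct_css_element returns 3 -- i.e. td:nth-child(3), the "3" link that
# click_page_forward then clicks to move one page forward.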
def click_page_forward(driver, counter, pagecount, pagination_length):
    """Click one page forward to access the PDF files on the next page.
    Args:
        driver: Selenium web driver.
        counter (int): Position counter within the current pagination block.
        pagecount (int): Number of the page currently being processed.
        pagination_length (int): Length of the current pagination block.
    Returns:
        Tuple (pagination ongoing, reset counter, pagination length of the current block).
    """
    # Extract page attributes
    page_html = driver.page_source
    page_soup = soup(page_html, 'html.parser')
    pagination_container = page_soup.findAll('table', {'class': 'mGrid'})[0].tbody.findAll('tr', {'align': 'center'},
                                             {'style': 'color:White;background-color:#2461BF;'})[0].findAll('td')
    # Remember the length of the previous pagination block
    pagination_length_old = pagination_length
    # Extract information on the current pagination block
    last_element = pagination_container[len(pagination_container) - 1].text
    pagination_length = len(pagination_container) - 1
    print("Last element:", last_element)
    # Differentiate cases: once the page break is reached, check the last element
    if counter <= pagination_length_old:
        counter = find_correct_css_element(pagination_container)
    else:
        if last_element == "...":
            counter = find_correct_css_element(pagination_container)
        else:
            try:
                assert int(pagecount) <= int(last_element)
                counter = find_correct_css_element(pagination_container)
            except AssertionError:
                return (False, counter, pagination_length)
    try:
        driver.find_element_by_css_selector('table#ContentPlaceHolder1_gwVLPListimi tr:nth-child(12) > td > table > tbody > tr > td:nth-child(' + str(counter) + ') > a').click()
        return (True, counter, pagination_length)
    except NoSuchElementException:
        return (False, counter, pagination_length)
def check_item_number(page_html):
    """Check the number of listings available in the PDF grid.
    Args:
        page_html (str): HTML source of the page.
    Returns:
        Number of item rows found on the page.
    """
    page_soup = soup(page_html, 'html.parser')
    # Rows alternate between two background colours; collect both to count all items
    items_container = page_soup.findAll('table', {'class': 'mGrid'})[0].tbody.findAll('tr', {'style': 'background-color:#EFF3FB;'})
    items_container = items_container + page_soup.findAll('table', {'class': 'mGrid'})[0].tbody.findAll('tr', {'style': 'background-color:White;'})
    return len(items_container)
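# Usage sketch (illustrative): count the PDF rows on the currently loaded page and
# iterate over the download links; this mirrors the loop used in scrape_n_e_s_a below.
#   items = check_item_number(driver.page_source)
#   for item in range(items):
#       ...  # click '#ContentPlaceHolder1_gwVLPListimi_lnkOpenProfile_<item>' and save the PDF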
def scrape_n_e_s_a(base_url, robust, driver, output_path, now_str):
    """Page through the listing grid and download every PDF document.
    Args:
        base_url (str): Base URL of the listing page.
        robust (bool): Robustness parameter.
        driver: Selenium web driver.
        output_path (str): Folder in which the PDF files are stored.
        now_str (str): Timestamp string used to name the output folder.
    Returns:
        None; the PDF files are written to a timestamped subfolder.
    """
    pagination_ongoing = True
    counter = 0
    pagecount = 0
    pagination_length = 11
    first_run = True
    # Create folder
    now_folder = output_path + now_str + "\\"
    os.mkdir(now_folder)
    # Start PDF extraction into folder
    print("Start retrieving PDF documents ...")
    # Open first webpage
    open_webpage(driver, base_url)
    # Loop over pages by checking if a next page is available
    while pagination_ongoing:
        pagecount += 1
        counter += 1
        # Wait a few seconds
        time.sleep(3)
        # Within each page extract all links
        print("I am on page", pagecount, "Counter =", counter, "Pagination length", pagination_length)
        if not first_run:
            # Click onwards
            click_tuple = click_page_forward(driver, counter, pagecount, pagination_length)
            pagination_ongoing = click_tuple[0]
            # Reset counter if necessary
            counter = click_tuple[1]
            # Reset pagination_length of the current block
            pagination_length = click_tuple[2]
            if not pagination_ongoing:
                print("Reached end, breaking out from click routine...")
                break
        # Loop over items on the page
        page_html = driver.page_source
        items = check_item_number(page_html)
        for item in range(0, items):
            print("Items:", items, "current item", item)
            now_substr = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            time.sleep(random.randint(2, 4))
            print("Extracting PDF", item + 1, "on page", pagecount, "...")
            # Click download button
            driver.find_element_by_css_selector('#ContentPlaceHolder1_gwVLPListimi_lnkOpenProfile_' + str(item)).click()
            driver.switch_to.window(driver.window_handles[-1])
            # Extract URL of the PDF
            pdf_url = driver.current_url
            # Save the PDF locally to the output path
            url_id_container = pdf_url.split('/')
            url_id = url_id_container[len(url_id_container) - 1].replace('.pdf', '')
            outfile = now_folder + url_id + "_" + now_substr + ".pdf"
            # Get response code
            response = requests.get(pdf_url, timeout=60)
            # Write the file depending on the response
            if response.status_code == 200:
                with open(outfile, 'wb') as f:
                    f.write(response.content)
            else:
                print("Skipping PDF..")
            # Close tab
            #driver.close()
            # Switch back to the main window
            driver.switch_to.window(driver.window_handles[0])
        first_run = False
def main():
    """Note: Set the scraping parameters in this function.
    """
    # Set time stamp
    now_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    # Set scraping parameters
    base_url = 'http://www.puna.gov.al/VLPDisplay.aspx'
    robust = True
    webdriverpath = "C:\\Users\\Calogero\\Documents\\GitHub\\job_portal_web_bot\\chromedriver.exe"
    # Set output path for the PDF files
    output_path = "C:\\Users\\Calogero\\Documents\\GitHub\\job_portal_web_bot\\data\\daily_scraping\\"
    # Set up a web driver
    driver = set_driver(webdriverpath, False)
    # Start timer: capture start and end time for performance
    start_time = time.time()
    # Execute the scraping routine
    scrape_n_e_s_a(base_url, robust, driver, output_path, now_str)
    driver.quit()
    end_time = time.time()
    duration = time.strftime("%H:%M:%S", time.gmtime(end_time - start_time))
    # For interaction and error handling
    final_text = "Your query was successful! Time elapsed: " + str(duration)
    print(final_text)
    time.sleep(0.5)

# Execute scraping
if __name__ == "__main__":
    main()