# main.py
from bs4 import BeautifulSoup
import requests
import csv
import time
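# Optional hardening (a sketch, not part of the original script): the loop
# below already sleeps on 403 responses, and a browser-like User-Agent may
# reduce how often they occur. The header value here is an assumption, not
# a verified requirement of the site.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
}
# If needed, pass it to each request: requests.get(url, headers=HEADERS)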
# Collect each category's link and name from the main browse page
category_links = []
main_url = 'https://www.coles.com.au/browse'
main_html = requests.get(main_url).text
soup_categories = BeautifulSoup(main_html, "lxml")
category_tiles = soup_categories.find("ul", class_="sc-fc107670-0")
for tile in category_tiles.find_all("li"):
    card = tile.find("a", attrs={'data-testid': 'category-card'})
    if card is None:
        continue
    category_links.append((card.get('href'), card.get('aria-label')))
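# Progress check: confirm categories were found before scraping any pages.
# Each tuple should look like ('/browse/<category>', '<Category Name>'); the
# exact href pattern is an assumption based on the BASE_URL join used below.
print(f'Found {len(category_links)} categories to scrape')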
def retrieve_data(html, category_name, page_num):
    html_text = html.text
    soup = BeautifulSoup(html_text, 'lxml')
    # Select product tiles only, which skips sponsored/ad placements
    non_ads = soup.find_all('section', attrs={"data-testid": 'product-tile'})
    # Parse every product tile and collect its details in parallel lists
    product_names, product_prices, product_links, product_codes, category_names = [], [], [], [], []
    for tile in non_ads:
        product_name = tile.find('h2', class_='product__title').text.rstrip()
        # Tiles with no price element are recorded as out of stock
        try:
            product_price = tile.find('span', class_='price__value').text
        except AttributeError:
            print(f'Check {product_name}, page {page_num}')
            product_names.append(product_name)
            product_prices.append('Out of Stock')
            product_links.append('Out of Stock')
            product_codes.append('Out of Stock')
            category_names.append(category_name)
            continue
        product_link = 'coles.com.au' + tile.find('a', class_='product__link').get("href")
        product_code = product_link.split('-')[-1]
        product_names.append(product_name)
        product_prices.append(product_price)
        product_links.append(product_link)
        product_codes.append(product_code)
        category_names.append(category_name)
    # Return the rows, or 0 when the page has no products (end of category)
    data = list(zip(product_names, product_prices, product_codes, product_links, category_names))
    if len(data) == 0:
        return 0
    return data
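# Illustrative call (a sketch; the category path below is an assumption):
# page = requests.get('https://www.coles.com.au/browse/fruit-vegetables?page=1')
# rows = retrieve_data(page, 'Fruit & Vegetables', 1)
# Each row is (name, price, code, link, category), matching the CSV header below.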
data = []
BASE_URL = 'https://www.coles.com.au'
for category_link, category_name in category_links:
    page_num = 1
    while True:
        URL = BASE_URL + category_link + f'?page={page_num}'
        html = requests.get(URL)
        # Track progress and aid debugging
        print(f'Currently scraping page {page_num} of {category_name}')
        print(f'Response code is: {html.status_code}')
        # On a 403, sleep for 2 minutes and retry the same page to prevent access denial
        if html.status_code == 403:
            time.sleep(120)
            continue
        data_found = retrieve_data(html, category_name, page_num)
        # Stop at the first non-OK response or empty page (end of category)
        if html.status_code != 200 or data_found == 0:
            break
        data += data_found
        page_num += 1
        time.sleep(2)
# Write the collected rows to a CSV file
with open('coles_data.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Product Name', 'Product Price', 'Product Code', 'Product Link', 'Category'])
    writer.writerows(data)
print("Done successfully.")
# to do:
# 1. refactor code into more reusable functions
# 2. add more error handling
# 3. add more comments
# 4. add more data to be scraped (e.g. product description, product image, etc.)
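# A possible shape for to-do items 1 and 2 (a sketch, not part of the script):
# wrap the fetch in a small reusable helper with basic retry handling. The
# name get_with_retry and its parameters are hypothetical.
def get_with_retry(url, retries=3, backoff=120):
    """Fetch a URL, sleeping and retrying on 403s or connection errors."""
    for attempt in range(retries):
        try:
            response = requests.get(url)
        except requests.RequestException as exc:
            print(f'Request failed ({exc}), retrying...')
            time.sleep(backoff)
            continue
        if response.status_code == 403:
            time.sleep(backoff)
            continue
        return response
    return None  # caller should treat None as "give up on this page"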