-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
103 lines (81 loc) · 3.55 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import urllib.request
from bs4 import BeautifulSoup
import db
import re
import progressbar as p
import time
import sys
def xstr(s):
    """Return ``str(s)``, or an empty string when *s* is ``None``.

    Used for scraped values that may be missing, so that string methods
    such as ``rstrip`` can be applied to the result unconditionally.
    """
    if s is None:
        return ''
    return str(s)
def download_url(url):
    """Download *url* and return its content parsed by BeautifulSoup.

    Raises:
        urllib.error.URLError: the request failed. This is an ``OSError``
            subclass; HTTP error statuses raise its subclass
            ``urllib.error.HTTPError``.
    """
    # FancyURLopener is deprecated and was removed in Python 3.14, so a
    # regular opener is used instead. The empty ProxyHandler mapping keeps
    # the "use no proxies" setting that FancyURLopener({}) expressed.
    opener = urllib.request.build_opener(urllib.request.ProxyHandler({}))
    # The context manager closes the connection once the body is read.
    with opener.open(url) as response:
        content = response.read()
    return BeautifulSoup(content, "html.parser")
# Startup banner. The ASCII art is full of backslashes, so it is written as a
# raw string: in a normal string sequences such as "\|" or "\_" are invalid
# escapes, which newer Python versions warn about. The printed text is the same.
print(r"""
___ _ _ _ _ _ _
|_ | | | (_) | | | | | | | |
| | __ _ _ __ | | _ _ __ __| | ___ _ __ ___ | | | | ___| |__ ___ ___ _ __ __ _ _ __ ___ _ __
| |/ _` | '_ \| | | | '_ \ / _` |/ _ \ '__/ __| | |/\| |/ _ \ '_ \/ __|/ __| '__/ _` | '_ \ / _ \ '__|
/\__/ / (_| | | | | |___| | | | | (_| | __/ | \__ \ \ /\ / __/ |_) \__ \ (__| | | (_| | |_) | __/ |
\____/ \__,_|_| |_\_____/_|_| |_|\__,_|\___|_| |___/ \/ \/ \___|_.__/|___/\___|_| \__,_| .__/ \___|_|
| |
|_| """)
# Matches a price delivered as a single decimal string, e.g. "15.24".
_DECIMAL_PRICE = re.compile(r"^\d+\.\d+$")


def _parse_price(item):
    """Return the price of one product container, rounded to two decimals.

    The "big" span holds either the complete decimal price ("15.24") or only
    the whole part; in the latter case the hundredths are read from the
    "small" span of the same container.
    """
    big_text = item.find("span", {"class": "big"}).string
    if _DECIMAL_PRICE.match(big_text):
        # Convert the decimal text directly. Splitting on "." and dividing the
        # fractional digits by 100 would turn "15.5" into 15.05.
        return round(float(big_text), 2)
    # Price is split up in two elements.
    small_text = item.find("span", {"class": "small"}).string
    return round(int(big_text) + int(small_text) / 100, 2)


# Start timer.
start = time.time()

# Get the main page.
soup = download_url("http://www.janlinders.nl/ons-assortiment.html")

# Get the container that has all group links.
catalog_subnav = soup.find("div", {"id": "catalog_subnav"})
if catalog_subnav is None:
    # Fail with a clear message instead of an AttributeError further down.
    sys.exit("Couldn't find the catalog navigation on the main page.")

# Get all group links. Plain concatenation is used because the hrefs are
# data, not format templates (braces in an href must stay untouched).
links = []
for entry in catalog_subnav.findAll("li"):
    for link in entry.findAll("a", href=True):
        links.append("http://www.janlinders.nl/" + link['href'])

# Database instance. It gets its own name so the imported ``db`` module is
# not overwritten.
try:
    database = db.Database()
except Exception:
    print("Couldn't create database instance, are you sure you have the database configured and running?")
    # Non-zero status: the run failed.
    sys.exit(1)

for index, url in enumerate(links):
    # Print progress.
    p.Progressbar.print_progress(index, len(links), prefix="Progress:", suffix="Complete", bar_length=50)

    # Download the current page.
    soup = download_url(url)

    # Find all products on the page.
    products = soup.findAll("div", {"class": "item_container"})
    if not products:
        continue

    # The group name is the same for every product on the page, so it is
    # looked up once per page rather than once per product.
    group = soup.find("h1").string

    # Get the details for each product.
    for item in products:
        name = item.findAll(['title', 'a'])[1].get('title').rstrip()
        brand = xstr(item.find('span', {"class": "teaser"}).string).rstrip()
        weight = item.find('span', {"class": "inhoud"}).string.rstrip()
        price = _parse_price(item)

        # To view the items that are scraped uncomment the next line.
        # print(name, price, group)
        database.insert(name, price, brand, weight, group)

# Show the total time.
end = time.time()
print("Total time elapsed: {0} seconds".format(round(end - start)))