#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import bs4
import datetime
import json
import urllib.request
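# Scrapes Funda (funda.nl) search results for the configured city and price range,
# and maintains a local JSON file with one entry per listing, including a per-date
# price history and the date the listing was last seen.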
def get_webdata(w):
    # Fetch the detail page of a single listing and return its characteristics
    # ('kenmerken', the dt/dd pairs) as a dictionary.
    fundaUrl = 'http://www.funda.nl'
    html = urllib.request.urlopen(fundaUrl + w['link']).read().decode('latin-1')
    soup = bs4.BeautifulSoup(html, 'html.parser')
    r = {}
    for ad in soup.findAll(attrs={'class': 'object-kenmerken-body'}):
        for n in ad.findAll(attrs={'class': 'object-kenmerken-list'}, recursive=False):
            for k in n.findAll(['dd', 'dt']):
                try:
                    # A <dt> label is directly followed by its <dd> value.
                    if k.find_next_sibling().name == 'dd':
                        r[k.text.strip()] = k.find_next_sibling().text.strip()
                except AttributeError:
                    # No next sibling (e.g. the last element): skip it.
                    continue
    return r
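# Hypothetical example (the link and the returned keys depend on the listing):
#   get_webdata({'link': '/koop/rotterdam/huis-12345678-voorbeeldstraat-1/'})
#   might return {'Bouwjaar': '1930', 'Aantal kamers': '4 kamers (3 slaapkamers)', ...}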
# ===== DEFINE SEARCH OPTIONS =====
MAX_PAGES = 9999999  # max number of result pages to process
city = 'rotterdam'
minPrice = str(100000)
maxPrice = str(200000)
fundaUrl = 'http://www.funda.nl'
baseUrl = fundaUrl + '/koop/' + city + '/' + minPrice + '-' + maxPrice + '/' + 'bestaande-bouw' + '/'
# with garage:
#baseUrl = baseUrl + "/aangebouwde-garage/inpandige-garage/vrijstaande-garage/"
baseUrl = baseUrl + 'p'
data_file_name = 'data.json'
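# With the defaults above, the URL for the first result page becomes:
#   http://www.funda.nl/koop/rotterdam/100000-200000/bestaande-bouw/p1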
# ===== START SCRAPE =====
current_date = datetime.datetime.now().strftime("%Y-%m-%d")
print("Welcome! Today is ", current_date)
print("Scraping ... " + baseUrl)
html = urllib.request.urlopen(baseUrl).read().decode('latin-1')
soup = bs4.BeautifulSoup(html, 'html.parser')
# Get the number of result pages from the pagination widget.
try:
    pages = soup.findAll(attrs={'class': 'pagination-number pagination-last'})
    numberOfPages = int(pages[0]['data-pagination-page'])
except IndexError:
    print("IndexError when trying to find number of pages")
    numberOfPages = 1
print("Pages found: " + str(numberOfPages))
if numberOfPages > MAX_PAGES:
    print("More pages found than MAX_PAGES -> only processing first " + str(MAX_PAGES) + " pages.\n")
    numberOfPages = MAX_PAGES
items = []
# Load the data file if it exists; otherwise create it with an empty list.
try:
    with open(data_file_name, 'x+') as data_file:
        data_file.write('[]')
    data = []
except FileExistsError:
    with open(data_file_name, 'r+') as data_file:
        data = json.load(data_file)
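# Each entry in data.json is a dict shaped roughly like:
#   {'title': ..., 'subtitle': ..., 'link': ..., 'area': ...,
#    'price': [{'price': ..., 'date': 'YYYY-MM-DD'}, ...],
#    'last_seen': 'YYYY-MM-DD',
#    ...plus the kenmerken key/value pairs returned by get_webdata()}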
# Check each result page.
for pageNo in range(1, numberOfPages + 1):
    nextUrl = baseUrl + str(pageNo)
    print('Scraping page ' + str(pageNo) + ' - ' + nextUrl)
    html = urllib.request.urlopen(nextUrl).read().decode('latin-1')
    soup = bs4.BeautifulSoup(html, 'html.parser')
    # Check each ad on the page.
    for ad in soup.findAll(attrs={'class': 'search-result-content-inner'}):
        # Extract the basic data from the ad.
        webdata = {}
        try:
            webdata['title'] = ad.find(attrs={'class': 'search-result-title'}).contents[0].strip()
            webdata['subtitle'] = ad.find(attrs={'class': 'search-result-subtitle'}).contents[0].strip()
            webdata['link'] = ad.find(attrs={'class': 'search-result-header'}).a['href'].strip()
            # The page was decoded as latin-1, so the UTF-8 euro sign shows up as 'â\x82¬';
            # strip it together with the 'k.k.' suffix and the thousands separators.
            webdata['price'] = ad.find(attrs={'class': 'search-result-price'}).contents[0].replace('â\x82¬', "").replace("k.k.", "").strip().replace(".", "")
        except Exception:
            # On any error we just continue with the next ad.
            print('Error getting listing. Going to next ...')
            continue
        try:
            webdata['area'] = ad.find(attrs={'title': 'Woonoppervlakte'}).contents[0].replace("m²", "").strip()
        except Exception:
            # If the living area is missing, keep going with a placeholder.
            webdata['area'] = '0'
        # Check whether this entry already exists in our JSON data.
        adAlreadyInDatabase = False
        for n in data:
            if n['title'] == webdata['title']:
                adAlreadyInDatabase = True
                newPrice = True
                for p in n['price']:
                    if p['date'] == current_date:
                        newPrice = False
                if newPrice:
                    print('Updating price for: ', [webdata['title'], webdata['subtitle']])
                    n['price'].append({'price': webdata['price'], 'date': current_date})
                n['last_seen'] = current_date
        if not adAlreadyInDatabase:
            print('Adding new entry: ', [webdata['title'], webdata['subtitle']])
            # Get the ad-specific data from the detail page.
            r = get_webdata(webdata)
            # Add the high-level info from the search result.
            r['title'] = webdata['title']
            r['link'] = webdata['link']
            r['subtitle'] = webdata['subtitle']
            r['price'] = [{'price': webdata['price'], 'date': current_date}]
            r['area'] = webdata['area']
            r['last_seen'] = current_date
            data.append(r)
    # Export the JSON after every page, so we don't lose too much if something crashes.
    with open(data_file_name, 'w') as data_file:
        print('Writing to file ...')
        data_file.write(json.dumps(data, indent=4))
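# A quick way to inspect the results afterwards (a sketch, assuming data.json is in
# the working directory):
#   python3 -c "import json; d = json.load(open('data.json')); print(len(d), 'listings')"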