update_blocket.py
"""
Module that inhabits some methods to download cars from blocket.se
"""
#!/usr/bin/python3.6
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import re
import pandas as pd
from collections import OrderedDict
import numpy as np
import os.path
import logging
from logging import handlers
import sys
import warnings

warnings.filterwarnings('ignore')

# Log to stdout and to a rotating file (5 MB per file, 7 backups).
log = logging.getLogger('')
log.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
ch = logging.StreamHandler(sys.stdout)
ch.setFormatter(formatter)
log.addHandler(ch)
fh = handlers.RotatingFileHandler('blocket.log', maxBytes=(1048576 * 5), backupCount=7)
fh.setFormatter(formatter)
log.addHandler(fh)
def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of the response is some kind of HTML/XML, return
    the raw content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None
    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
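# Illustrative usage (hypothetical call, not part of the original script):
#   raw = simple_get('https://www.blocket.se/hela_sverige')
#   `raw` is the raw HTML bytes, or None if the request failed or the response was not HTML.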
def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200
            and content_type is not None
            and content_type.find('html') > -1)
def log_error(e):
    """
    It is always a good idea to log errors.
    This function just prints them, but you can
    make it do anything.
    """
    print(e)
def clean_string(s):
    s_clean = s.replace('\n', '').replace('\t', '')
    return s_clean
def clean_string2(s):
    s_clean = clean_string(s).replace(' ', '')
    return s_clean
def clean_price(s):
    s_clean = clean_string(s)
    s_clean = s_clean.replace('kr', '').replace(' ', '')
    price = float(s_clean)
    return price
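# Illustrative example (assumed blocket price format): clean_price('149 900 kr') -> 149900.0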
def find_id_from_href(href):
    result = re.search(pattern=r'(\d*)\.htm', string=href)
    ad_id = int(result.group(1))
    return ad_id
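# Illustrative example (assumes ad hrefs end in '<numeric id>.htm'):
#   find_id_from_href('.../bil_12345678.htm') -> 12345678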
def clean_horsepower(s_horsepower):
    result = re.search(r'\d+', s_horsepower)
    if result:
        return int(result.group(0))
    else:
        return np.nan
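# Illustrative example (assumed format of the 'Hästkrafter' field): clean_horsepower('190 hk') -> 190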
def get_extra_data(html):
    extra_data = html.find('dl', attrs={'class': 'col-xs-12 motor-extradata-details'})
    if extra_data:
        key_items = extra_data.findAll('dt')
        value_items = extra_data.findAll('dd')
        data_extra = pd.Series()
        for key_item, value_item in zip(key_items, value_items):
            key = key_item.text
            value = value_item.text
            data_extra[key] = value
        if 'Hästkrafter' in data_extra:
            s_horsepower = data_extra['Hästkrafter']
            data_extra['Hästkrafter'] = clean_horsepower(s_horsepower=s_horsepower)
        return data_extra
    else:
        return None
def parse_car(href):
    raw_html = simple_get(href)
    html = BeautifulSoup(raw_html, 'html.parser')
    header = html.find('h1')
    name = clean_string(header.text)
    item_details = html.find('div', attrs={'id': 'item_details'})
    items = item_details.find_all('dl', attrs={'class': 'col-xs-4'})
    data = pd.Series()
    for item in items:
        key = clean_string2(item.find('dt').text)
        value = clean_string2(item.find('dd').text)
        data[key] = value
    data['header'] = name
    price = html.find('div', attrs={'id': 'vi_price'})
    data['price'] = clean_price(price.text)
    extra_data = get_extra_data(html=html)
    if extra_data is not None:
        all_data = pd.concat([data, extra_data])
    else:
        all_data = data
    all_data.name = find_id_from_href(href=href)
    return all_data
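# Illustrative usage (hypothetical ad URL, not from the original script):
#   s_car = parse_car('https://www.blocket.se/.../bil_12345678.htm')
#   gives a pandas Series of ad attributes, named by the numeric ad id.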
def get_cars(car_path, max_cars=None):
    next_page_href = car_path
    df_cars = pd.DataFrame()
    counter = 0
    while next_page_href is not None:
        raw_html = simple_get(url=next_page_href)
        html = BeautifulSoup(raw_html, 'html.parser')
        item_list = html.find_all('div', attrs={'class': 'styled__Wrapper-sc-1kpvi4z-0 itHtzm'})
        for item in item_list:
            if max_cars is not None and counter >= max_cars:
                return df_cars
            href = item.find('a', attrs={'class': 'styled__Wrapper-sc-1kpvi4z-0 itHtzm'}).get('to')
            try:
                s_car = parse_car(href=href).copy()
            except AttributeError:
                logging.warning('could not parse car:%s' % href)
                continue
            else:
                a = item.find('div', attrs={'class': 'pull-left'})
                place = a.contents[-1]
                s_car['place'] = place
                s_car['href'] = href
                df_cars = df_cars.append(s_car)
                counter += 1
        next_page = html.find('a', attrs={'class': 'page_nav'}, text='\n Nästa sida »\n ')
        if next_page is None:
            next_page_href = None
        else:
            next_page_href = r'https://www.blocket.se/hela_sverige' + next_page['href']
    return df_cars
def load_from_blocket(car_paths, max_cars=None):
    logging.info('\n\n____________ Starting to load from blocket.se _____________')
    df_cars = pd.DataFrame()
    car_counter = 0
    for car_type, car_path in car_paths.items():
        logging.info('Loading car type:%s from:%s' % (car_type, car_path))
        if max_cars is None:
            max_cars_left = max_cars
        else:
            max_cars_left = max_cars - car_counter
        df_car_type_cars = get_cars(car_path=car_path, max_cars=max_cars_left)
        car_counter += len(df_car_type_cars)
        df_car_type_cars['car type'] = car_type
        df_cars = df_cars.append(df_car_type_cars)
    logging.info('%i cars have been successfully loaded today' % len(df_cars))
    return df_cars
def combine_new_and_old(df_cars, file_path='cars.csv'):
    logging.info('Combining with old data...')
    try:
        old_cars = pd.read_csv(file_path, sep=';', index_col=0)
    except Exception:
        logging.warning('Could not read old data from:%s' % file_path)
    else:
        df_cars = df_cars.combine_first(old_cars)
    return df_cars
def save(df_cars, file_path='cars.csv'):
    path = __file__
    directory = os.path.split(path)[0]
    save_path = os.path.join(directory, file_path)
    df_cars.to_csv(save_path, sep=';')
    logging.info('All data has been saved to:%s' % save_path)
    _, publish_name = os.path.split(file_path)
    name, ext = os.path.splitext(publish_name)
    publish_file_name = '%s_publish%s' % (name, ext)
    save_path_publish = os.path.join(directory, publish_file_name)
    df_cars.to_csv(save_path_publish, sep=',')
    logging.info('All data has also been saved to:%s' % save_path_publish)
if __name__ == '__main__':
    car_paths = OrderedDict()
    car_paths['kangoo'] = r'https://www.blocket.se/hela_sverige?q=&cg=1020&w=3&st=s&ps=&pe=&mys=&mye=&ms=&me=&cxpf=&cxpt=&fu=&gb=&ccco=1&ca=15&is=1&l=0&md=th&cp=&cb=30&cbl1=4'
    car_paths['berlingo'] = r'https://www.blocket.se/hela_sverige?q=&cg=1020&w=3&st=s&ps=&pe=&mys=&mye=&ms=&me=&cxpf=&cxpt=&fu=&gb=&ca=15&is=1&l=0&md=th&cp=&cb=7&cbl1=1'
    car_paths['caddy'] = r'https://www.blocket.se/hela_sverige?q=&cg=1020&w=3&st=s&ps=&pe=&mys=&mye=&ms=&me=&cxpf=&cxpt=&fu=&gb=&ccco=1&ca=15&is=1&l=0&md=th&cp=&cb=40&cbl1=2'
    df_cars = load_from_blocket(car_paths=car_paths, max_cars=None)
    df_cars = combine_new_and_old(df_cars=df_cars)
    save(df_cars)