forked from ljleppan/py3-opendata
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstatfi_px_api.py
130 lines (108 loc) · 4.51 KB
/
statfi_px_api.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
"""
This module contains tools to load information about PC Axis files in
Statistics Finland's databases and download the files from the databases
using the open data API:
http://www.stat.fi/org/lainsaadanto/avoin_data_en.html
For license see LICENSE document
"""
import os, csv, datetime, urllib.request, urllib.parse, urllib.error, zlib, time
class PxInfo(object):
    """
    A simple object representation of PX information in
    Statistics Finland's open data API.

    Every constructor argument is expected to be a string; each value is
    stored with surrounding whitespace stripped:

    path       -- from ``pathname`` (used by download_px as the download URL)
    size       -- from ``filesize``
    updated    -- from ``fileupdate``, parsed by ``updated_dt``
    tablesize  -- from ``tablesize``
    language   -- from ``languagecode``
    variables  -- from ``variables``
    title      -- from ``title``

    Any additional positional arguments are accepted and ignored, so a
    whole CSV row can be splatted into the constructor.
    """

    _timeformat = '%Y-%m-%d %H:%M'  # Just a cache place for dateformat

    def __init__(self, pathname, filesize, fileupdate, tablesize, languagecode, variables, title, *args):
        self.path = pathname.strip()
        self.size = filesize.strip()
        # The parameter is called ``fileupdate``; the previous code read an
        # undefined name here and raised NameError on every construction.
        self.updated = fileupdate.strip()
        self.variables = variables.strip()
        self.tablesize = tablesize.strip()
        self.language = languagecode.strip()
        self.title = title.strip()

    def __str__(self):
        return 'PX file %s: %s' % (self.path, self.title)

    def __repr__(self):
        return str(self)

    @property
    def created_dt(self):
        """
        Creation time as a datetime, or None when it is not known.

        The constructor does not receive a creation timestamp, so this only
        yields a value if a ``created`` attribute (in ``_timeformat``) has
        been assigned to the instance by other code.
        """
        created = getattr(self, 'created', None)
        if created is None:
            return None
        return datetime.datetime.strptime(created, self._timeformat)

    @property
    def updated_dt(self):
        """Last update time parsed from ``updated`` using ``_timeformat``."""
        return datetime.datetime.strptime(self.updated, self._timeformat)
def list_available_px(url='http://pxnet2.stat.fi/database/StatFin/StatFin_rap.csv'):
    """
    Creates a list of PxInfo objects from a given url. Url should point to a CSV file.
    Url's default value points to Statfin databases contents CSV.

    The response body is decoded as UTF-8, its first line is skipped as a
    header, and every remaining non-empty row is parsed as a
    semicolon-delimited record whose columns are passed positionally to
    PxInfo. Returns an empty list when there are no data rows.
    """
    # Read the whole body inside the context manager so the connection is
    # released as soon as the data is in memory.
    with urllib.request.urlopen(url) as response:
        lines = iter(response.read().decode('utf-8').splitlines())
    next(lines, None)  # Skip headers; tolerate a completely empty response
    # A leftover draft loop used to consume the reader here with a call that
    # passed only two arguments to PxInfo, so the function raised TypeError
    # and could never reach its return statement.
    # NOTE(review): that draft picked columns 0-5 and 15 of each row; confirm
    # against the live CSV whether the title really is the 7th column.
    # csv.reader yields [] for blank lines, which must not reach PxInfo.
    return [PxInfo(*row) for row in csv.reader(lines, delimiter=";") if row]
def download_px(px_objs, target_dir='.', compressed=False, sleep=1, refresh='check'):
    """
    Fetch PC Axis files for given list of Px objects
    Save the files to target directory

    px_objs    -- a single object or a list of objects whose ``path``
                  attribute is the URL to download
    target_dir -- directory under which the URL's path component is
                  recreated
    compressed -- when true, ask the server for a gzip-encoded response
    sleep      -- seconds to wait after each network request
    refresh    -- what to do when the target file already exists:
                  'never' skips it, 'check' skips it only if is_latest()
                  says the local copy is current, 'always' downloads again

    Raises ValueError if ``refresh`` is not one of the values above.
    HTTP errors and I/O errors for one file are printed and that file is
    skipped; the remaining files are still processed.

    WARNING: Statfin database contains over 2500 PX files with many gigabytes of data.
    """
    refresh_options = ['never', 'check', 'always']
    if refresh not in refresh_options:
        raise ValueError('Invalid value for refresh, must be one of "{}"'.format('", "'.join(refresh_options)))
    if not isinstance(px_objs, list):
        px_objs = [px_objs]
    for px_obj in px_objs:
        url_parts = urllib.parse.urlparse(px_obj.path)
        target_path = os.path.join(target_dir, url_parts.path[1:])  # url_parts.path starts with '/'
        target_path = os.path.abspath(target_path)
        if refresh != "always" and os.path.exists(target_path):
            if refresh == 'check':
                if is_latest(px_obj.path, target_path):
                    print('File {} is already latest, skipping'.format(target_path))
                    # The check itself was a network request, so honour the
                    # caller's delay instead of a hard-coded one second.
                    time.sleep(sleep)
                    continue
            elif refresh == 'never':
                print('File {} already exists, skipping'.format(target_path))
                continue
        print('Downloading file from {} ...'.format(px_obj.path), end=' ')
        try:
            request = urllib.request.Request(px_obj.path)
            if compressed:
                request.add_header('Accept-encoding', 'gzip')
            response = urllib.request.urlopen(request)
        except urllib.error.HTTPError as e:
            print('ERROR:', e)
            print('Response headers:', e.headers)
            time.sleep(sleep)
            continue
        makedirs(target_path)
        try:
            # Read the body before touching the target file: opening it
            # first would leave an empty file behind when the read fails,
            # and later runs would then skip it as already downloaded.
            with response:
                data = response.read()
                encoding = response.headers.get('Content-Encoding', '')
            # The server is free to ignore Accept-encoding, so only gunzip
            # when the response says it is actually gzip-encoded.
            if compressed and encoding.strip().lower() in ('gzip', 'x-gzip'):
                data = zlib.decompress(data, zlib.MAX_WBITS | 16)
            with open(target_path, 'wb') as f:
                f.write(data)
        except (IOError, zlib.error) as e:
            print('ERROR:', e)
            time.sleep(sleep)
            continue
        print('done')
        time.sleep(sleep)
def is_latest(url, file_path):
    """
    Check whether the local file is newer than the network resource.

    Sends a HEAD request to ``url`` and compares its Last-Modified header
    with the modification time of ``file_path``.

    Returns True when the local file was modified after the remote resource,
    and also when the HEAD request fails with an HTTP error (the local copy
    is then kept). Returns False when the remote resource is newer or when
    the server gives no usable Last-Modified header, since the local copy
    cannot be shown to be current.
    """
    try:
        request = urllib.request.Request(url, method='HEAD')
        with urllib.request.urlopen(request) as response:
            last_modified = response.headers.get('Last-Modified')
    except urllib.error.HTTPError:
        return True
    if not last_modified:
        return False
    utc = datetime.timezone.utc
    try:
        url_modified_dt = datetime.datetime.strptime(
            last_modified, '%a, %d %b %Y %H:%M:%S GMT').replace(tzinfo=utc)
    except ValueError:
        # Unparseable header: treat the local copy as not known to be current.
        return False
    # Last-Modified is expressed in GMT, so the file time must be taken in
    # UTC too; a naive local-time value would skew the comparison by the
    # machine's UTC offset.
    file_mtime_dt = datetime.datetime.fromtimestamp(os.path.getmtime(file_path), tz=utc)
    return url_modified_dt < file_mtime_dt
def makedirs(px_path):
    """
    Best-effort creation of the directory that will contain ``px_path``.

    Any OSError raised while creating the directory (for example because it
    already exists) is ignored; nothing is returned.
    """
    parent_dir = os.path.dirname(px_path)
    try:
        os.makedirs(parent_dir)
    except OSError:
        # Deliberately ignored: a later attempt to open the file reports
        # any real problem with the directory.
        return