-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_open_data.py
79 lines (59 loc) · 2.97 KB
/
get_open_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# -*- coding: utf-8 -*-
"""
Function to get a csv or xlsx file from a CKAN site and convert it to a pandas DataFrame.
Developed by : Nelson Rossi Bittencourt
GitHub : https://github.com/NelsonBittencourt
Version : 0.11
"""
import pandas as pd
import requests
def get_ckan_data(open_data_URL, res_name, res_item=0, res_type='csv', res_encoding='utf-8',res_sep=';',res_decimal='.', res_timeout=30):
    """
    Get a file from a CKAN site, located by resource name, as a pandas DataFrame.

    The resource is looked up through the CKAN 'resource_search' action and the
    file found at the resource's 'url' is loaded with pandas.
    For more information see the CKAN API documentation: https://docs.ckan.org/

    Arguments:
        : open_data_URL - base URL of the CKAN site that contains the resource
                          (a trailing '/' is accepted);
        : res_name      - resource name (mandatory);
        : res_item      - (optional, default=0) index into the search results, for
                          use when several files match 'res_name';
        : res_type      - (optional, default='csv') file type to convert to a
                          dataframe: 'csv' or 'xlsx';
        : res_encoding  - (optional, default='utf-8') text encoding passed to
                          pandas.read_csv (csv only);
        : res_sep       - (optional, default=';') csv separator character (csv only);
        : res_decimal   - (optional, default='.') decimal character and
        : res_timeout   - (optional, default=30) seconds to wait for the CKAN search
                          request before giving up; None disables the timeout.

    Returns:
        : Pandas dataframe - resource exists and was converted and
        : -1               - on any failure (unsupported 'res_type', request error,
                             non-200 status, unexpected response, resource or item
                             not found, conversion error). A message describing the
                             failure is printed.

    Needs:
        : requests, pandas.

    Version: 0.11
    """

    # Reject unsupported file types up front. Previously this case printed the
    # message and then crashed on 'return df' because 'df' was never assigned.
    if res_type not in ('csv', 'xlsx'):
        print("The option 'resource_type' must be 'csv' or 'xlsx'")
        return -1

    # CKAN search resource URL, based on open_data_URL provided by user.
    # Strip trailing slashes so 'http://site/' does not produce '//api/...'.
    ckan_URL = open_data_URL.rstrip('/') + '/api/3/action/resource_search'

    # Dictionary to pass parameters to ckan_URL (query for resource).
    ckan_params = {'query': 'name:' + res_name}

    # Request to the site. Network failures (DNS, refused connection, timeout)
    # are reported through the documented -1 return instead of propagating.
    try:
        r = requests.get(ckan_URL, params=ckan_params, timeout=res_timeout)
    except requests.RequestException as e:
        print('Error trying to access resource {} in {}. [Reason ={}]'.format(res_name, open_data_URL, e))
        return -1

    # Check request for success or error.
    if r.status_code != 200:
        print('Error trying to access resource {} in {}. [Reason ={}]'.format(res_name, open_data_URL, r.reason))
        return -1

    # Parse the response body once; a body that is not JSON or lacks the
    # expected keys is treated as a failure to reach the resource.
    try:
        result = r.json()['result']
        count = result['count']
    except (ValueError, KeyError, TypeError) as e:
        print('Error trying to access resource {} in {}. [Reason = unexpected response: {}]'.format(res_name, open_data_URL, e))
        return -1

    # Check if resource was found.
    if count == 0:
        print('Error trying to access resource {} in {}. [Reason = resource not found!]'.format(res_name, open_data_URL))
        return -1

    # Get URL file from resource data. 'res_item' may point outside the
    # returned results list, so guard the lookup.
    try:
        data_url = result['results'][res_item]['url']
    except (IndexError, KeyError, TypeError):
        print('Error trying to access resource {} in {}. [Reason = resource item {} not found!]'.format(res_name, open_data_URL, res_item))
        return -1

    # Try to convert resource file to Pandas dataframe.
    # The broad except is deliberate: pandas can raise network, parser,
    # decoding or missing-engine errors here, and all map to the -1 return.
    try:
        if res_type == 'csv':
            df = pd.read_csv(data_url, sep=res_sep, header=0, decimal=res_decimal, encoding=res_encoding)
        else:
            # res_type == 'xlsx' (validated above).
            df = pd.read_excel(data_url, decimal=res_decimal)
    except Exception as e:
        print('Error trying to convert csv to dataframe. Error: {}'.format(e))
        return -1

    # Returns raw dataframe to user.
    return df