Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: IO World Bank WDI #2592

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ language: python
python:
- 2.6
- 2.7
- 3.1 # travis will soon EOL this
# - 3.1 # travis EOL
- 3.2
- 3.3

Expand Down Expand Up @@ -45,8 +45,10 @@ before_install:
install:
- echo "Waldo2"
- ci/install.sh
- ci/print_versions.py # not including stats

script:
- echo "Waldo3"
- ci/script.sh

after_script:
- ci/print_versions.py
32 changes: 16 additions & 16 deletions ci/before_install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,20 @@ fi

sudo apt-get update $APT_ARGS # run apt-get update for all versions

# hack for broken 3.3 env
if [ x"$VIRTUAL_ENV" == x"" ]; then
VIRTUAL_ENV=~/virtualenv/python$TRAVIS_PYTHON_VERSION_with_system_site_packages;
fi
# # hack for broken 3.3 env
# if [ x"$VIRTUAL_ENV" == x"" ]; then
# VIRTUAL_ENV=~/virtualenv/python$TRAVIS_PYTHON_VERSION_with_system_site_packages;
# fi

# we only recreate the virtualenv for 3.x
# since the "Detach bug" only affects python3
# and travis has numpy preinstalled on 2.x which is quicker
_VENV=$VIRTUAL_ENV # save it
if [ ${TRAVIS_PYTHON_VERSION:0:1} == "3" ] ; then
deactivate # pop out of any venv
sudo pip install virtualenv==1.8.4 --upgrade
sudo apt-get install $APT_ARGS python3.3 python3.3-dev
sudo rm -Rf $_VENV
virtualenv -p python$TRAVIS_PYTHON_VERSION $_VENV --system-site-packages;
source $_VENV/bin/activate
fi
# # we only recreate the virtualenv for 3.x
# # since the "Detach bug" only affects python3
# # and travis has numpy preinstalled on 2.x which is quicker
# _VENV=$VIRTUAL_ENV # save it
# if [ ${TRAVIS_PYTHON_VERSION:0:1} == "3" ] ; then
# deactivate # pop out of any venv
# sudo pip install virtualenv==1.8.4 --upgrade
# sudo apt-get install $APT_ARGS python3.3 python3.3-dev
# sudo rm -Rf $_VENV
# virtualenv -p python$TRAVIS_PYTHON_VERSION $_VENV --system-site-packages;
# source $_VENV/bin/activate
# fi
2 changes: 1 addition & 1 deletion ci/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ fi
if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ] || \
[ ${TRAVIS_PYTHON_VERSION} == "3.1" ] || \
[ ${TRAVIS_PYTHON_VERSION} == "3.2" ]; then
pip $PIP_ARGS install numpy; #https://github.com/y-p/numpy/archive/1.6.2_with_travis_fix.tar.gz;
pip $PIP_ARGS install numpy;
else
pip $PIP_ARGS install https://github.com/numpy/numpy/archive/v1.7.0b2.tar.gz;
fi
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3240,7 +3240,7 @@ def fillna(self, value=None, method=None, axis=0, inplace=False,

Parameters
----------
method : {'backfill', 'bfill', 'pad', 'ffill', None}, default 'pad'
method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
Method to use for filling holes in reindexed Series
pad / ffill: propagate last valid observation forward to next valid
backfill / bfill: use NEXT valid observation to fill gap
Expand Down
30 changes: 30 additions & 0 deletions pandas/io/tests/test_wb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import pandas
from pandas.util.testing import network
from pandas.util.testing import assert_frame_equal
from numpy.testing.decorators import slow
# BUG FIX: this PR adds the module as pandas/io/wb.py, so the import must
# come from pandas.io.wb -- importing pandas.io.wdi would raise ImportError.
from pandas.io.wb import (search, download)

@slow
@network
def test_wdi_search():
    # Reference rows from the World Bank series catalogue that the
    # GDP-per-capita query is expected to match; the integer labels are
    # positions in the full indicator table.
    exp_ids = {2634: u'GDPPCKD',
               4649: u'NY.GDP.PCAP.KD',
               4651: u'NY.GDP.PCAP.KN',
               4653: u'NY.GDP.PCAP.PP.KD'}
    exp_names = {2634: u'GDP per Capita, constant US$, millions',
                 4649: u'GDP per capita (constant 2000 US$)',
                 4651: u'GDP per capita (constant LCU)',
                 4653: u'GDP per capita, PPP (constant 2005 international $)'}
    expected = pandas.DataFrame({u'id': exp_ids, u'name': exp_names})
    # Only the first two columns (id, name) are compared.
    result = search('gdp.*capita.*constant').ix[:, :2]
    assert_frame_equal(result, expected)

@slow
@network
def test_wdi_download():
    # Hard-coded reference values for CA/MX/US over 2003-2005; the bogus
    # 'junk' country and indicator codes must be dropped silently.
    gdppckn = {(u'United States', u'2003'): u'40800.0735367688', (u'Canada', u'2004'): u'37857.1261134552', (u'United States', u'2005'): u'42714.8594790102', (u'Canada', u'2003'): u'37081.4575704003', (u'United States', u'2004'): u'41826.1728310667', (u'Mexico', u'2003'): u'72720.0691255285', (u'Mexico', u'2004'): u'74751.6003347038', (u'Mexico', u'2005'): u'76200.2154469437', (u'Canada', u'2005'): u'38617.4563629611'}
    gdppckd = {(u'United States', u'2003'): u'40800.0735367688', (u'Canada', u'2004'): u'34397.055116118', (u'United States', u'2005'): u'42714.8594790102', (u'Canada', u'2003'): u'33692.2812368928', (u'United States', u'2004'): u'41826.1728310667', (u'Mexico', u'2003'): u'7608.43848670658', (u'Mexico', u'2004'): u'7820.99026814334', (u'Mexico', u'2005'): u'7972.55364129367', (u'Canada', u'2005'): u'35087.8925933298'}
    expected = pandas.DataFrame({'GDPPCKN': gdppckn, 'GDPPCKD': gdppckd})
    result = download(country=['CA', 'MX', 'US', 'junk'],
                      indicator=['GDPPCKD', 'GDPPCKN', 'junk'],
                      start=2003, end=2005)
    # Index ordering from the service is not guaranteed; align before comparing.
    expected.index = result.index
    assert_frame_equal(result, expected)

183 changes: 183 additions & 0 deletions pandas/io/wb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
import urllib2
import warnings
import json
import pandas
import numpy as np

def download(country=('MX', 'CA', 'US'), indicator=('GDPPCKD', 'GDPPCKN'),
             start=2003, end=2005):
    """
    Download data series from the World Bank's World Development Indicators

    Parameters
    ----------

    indicator: string or list of strings
        taken from the ``id`` field in ``WDIsearch()``
    country: string or list of strings.
        ``all`` downloads data for all countries
        ISO-2 character codes select individual countries (e.g.``US``,``CA``)
    start: int
        First year of the data series
    end: int
        Last year of the data series (inclusive)

    Returns
    -------

    ``pandas`` DataFrame with columns: country, iso2c, year, indicator value,
    indexed on (country, year). Returns None if none of the requested
    indicators could be downloaded.
    """
    # ISO-2 codes accepted by the World Bank API ('all' selects every country).
    valid_countries = ["AG", "AL", "AM", "AO", "AR", "AT", "AU", "AZ", "BB",
        "BD", "BE", "BF", "BG", "BH", "BI", "BJ", "BO", "BR", "BS", "BW",
        "BY", "BZ", "CA", "CD", "CF", "CG", "CH", "CI", "CL", "CM", "CN",
        "CO", "CR", "CV", "CY", "CZ", "DE", "DK", "DM", "DO", "DZ", "EC",
        "EE", "EG", "ER", "ES", "ET", "FI", "FJ", "FR", "GA", "GB", "GE",
        "GH", "GM", "GN", "GQ", "GR", "GT", "GW", "GY", "HK", "HN", "HR",
        "HT", "HU", "ID", "IE", "IL", "IN", "IR", "IS", "IT", "JM", "JO",
        "JP", "KE", "KG", "KH", "KM", "KR", "KW", "KZ", "LA", "LB", "LC",
        "LK", "LS", "LT", "LU", "LV", "MA", "MD", "MG", "MK", "ML", "MN",
        "MR", "MU", "MW", "MX", "MY", "MZ", "NA", "NE", "NG", "NI", "NL",
        "NO", "NP", "NZ", "OM", "PA", "PE", "PG", "PH", "PK", "PL", "PT",
        "PY", "RO", "RU", "RW", "SA", "SB", "SC", "SD", "SE", "SG", "SI",
        "SK", "SL", "SN", "SR", "SV", "SY", "SZ", "TD", "TG", "TH", "TN",
        "TR", "TT", "TW", "TZ", "UA", "UG", "US", "UY", "UZ", "VC", "VE",
        "VN", "VU", "YE", "ZA", "ZM", "ZW", "all"]
    # Accept a bare string for a single country.
    if isinstance(country, str):
        country = [country]
    # Partition the requested codes; the API call uses only the valid ones.
    bad_countries = np.setdiff1d(country, valid_countries)
    country = np.intersect1d(country, valid_countries)
    country = ';'.join(country)
    # Work with a list of indicators.
    if isinstance(indicator, str):
        indicator = [indicator]
    # Download one series at a time; collect failures instead of aborting so
    # a single bad code does not lose the rest of the request (best effort).
    data = []
    bad_indicators = []
    for ind in indicator:
        try:
            tmp = _get_data(ind, country, start, end)
            tmp.columns = ['country', 'iso2c', 'year', ind]
            data.append(tmp)
        # Narrowed from a bare except: so SystemExit/KeyboardInterrupt
        # still propagate instead of being recorded as a "bad indicator".
        except Exception:
            bad_indicators.append(ind)
    # Report anything we had to drop via the warnings machinery (the module
    # already imports warnings) rather than printing to stdout.
    if len(bad_indicators) > 0:
        warnings.warn('Failed to obtain indicator(s): ' +
                      '; '.join(bad_indicators) +
                      '. The data may still be available for download at '
                      'http://data.worldbank.org')
    if len(bad_countries) > 0:
        warnings.warn('Invalid ISO-2 codes: ' + ' '.join(bad_countries))
    # Merge the individual WDI series on their shared (country, iso2c, year)
    # columns, then index on (country, year).
    if len(data) > 0:
        out = reduce(lambda x, y: x.merge(y, how='outer'), data)
        out = out.drop('iso2c', axis=1)
        out = out.set_index(['country', 'year'])
        return out


def _get_data(indicator = "NY.GNS.ICTR.GN.ZS", country = 'US',
              start = 2002, end = 2005):
    """Fetch one indicator for a set of countries as a raw 4-column frame."""
    # Assemble the REST query: one indicator, semicolon-separated country
    # codes, inclusive year range, single JSON page large enough for any reply.
    url = ("http://api.worldbank.org/countries/" + country + "/indicators/" +
           indicator + "?date=" + str(start) + ":" + str(end) +
           "&per_page=25000" + "&format=json")
    # Fetch and parse; element [1] of the JSON payload holds the observations.
    raw = urllib2.urlopen(url).read()
    records = json.loads(raw)[1]
    country_names = [rec['country']['value'] for rec in records]
    iso_codes = [rec['country']['id'] for rec in records]
    years = [rec['date'] for rec in records]
    values = [rec['value'] for rec in records]
    # One row per observation: country, iso2c, year, value.
    return pandas.DataFrame([country_names, iso_codes, years, values]).T


def get_countries():
    '''Query information about countries
    '''
    url = 'http://api.worldbank.org/countries/all?format=json'
    raw = urllib2.urlopen(url).read()
    table = pandas.DataFrame(json.loads(raw)[1])
    # The API nests these fields as {'id': ..., 'value': ...} dicts;
    # keep only the human-readable value for each.
    for col in ['adminregion', 'incomeLevel', 'lendingType', 'region']:
        table[col] = [entry['value'] for entry in table[col]]
    # Normalise the code columns to the names used elsewhere in this module.
    return table.rename(columns={'id': 'iso3c', 'iso2Code': 'iso2c'})


def get_indicators():
    '''Download information about all World Bank data series
    '''
    url = 'http://api.worldbank.org/indicators?per_page=50000&format=json'
    raw = urllib2.urlopen(url).read()
    table = pandas.DataFrame(json.loads(raw)[1])
    # 'source' is nested as {'id': ..., 'value': ...}; keep the value only.
    table.source = [entry['value'] for entry in table.source]
    # Strip non-ASCII characters from the organization names.
    table.sourceOrganization = table.sourceOrganization.apply(
        lambda s: s.encode('ascii', 'ignore'))

    # 'topics' is a list of {'value': ...} dicts, but some entries are
    # malformed -- fall back to the empty string for those.
    def _topic_name(topic):
        try:
            return topic['value']
        except Exception:
            return ''
    table.topics = table.topics.apply(
        lambda topics: ' ; '.join(_topic_name(t) for t in topics))
    # Clean output: sort the catalogue by indicator id and renumber the rows.
    table = table.sort(columns='id')
    table.index = pandas.Index(range(table.shape[0]))
    return table


# Lazily-populated cache of the full indicator table (see search()).
_cached_series = None
def search(string='gdp.*capi', field='name', case=False):
    """
    Search available data series from the world bank

    Parameters
    ----------

    string: string
        regular expression
    field: string
        id, name, source, sourceNote, sourceOrganization, topics
        See notes below
    case: bool
        case sensitive search?

    Notes
    -----

    The first time this function is run it will download and cache the full
    list of available series. Depending on the speed of your network
    connection, this can take time. Subsequent searches will use the cached
    copy, so they should be much faster.

    id : Data series indicator (for use with the ``indicator`` argument of
       ``download()``) e.g. NY.GNS.ICTR.GN.ZS"
    name: Short description of the data series
    source: Data collection project
    sourceOrganization: Data collection organization
    note:
    sourceNote:
    topics:
    """
    # Populate the cache on first use (one network round-trip). isinstance
    # replaces the original exact-type comparison against
    # pandas.core.frame.DataFrame, which would wrongly re-download for
    # DataFrame subclasses.
    global _cached_series
    if not isinstance(_cached_series, pandas.DataFrame):
        _cached_series = get_indicators()
    data = _cached_series[field]
    # Boolean mask of rows whose `field` matches the regular expression.
    idx = data.str.contains(string, case=case)
    out = _cached_series.ix[idx].dropna()
    return out