Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: IO World Bank WDI #2592

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ language: python
python:
- 2.6
- 2.7
- 3.1 # travis will soon EOL this
# - 3.1 # travis EOL
- 3.2
- 3.3

Expand Down Expand Up @@ -45,8 +45,10 @@ before_install:
install:
- echo "Waldo2"
- ci/install.sh
- ci/print_versions.py # not including stats

script:
- echo "Waldo3"
- ci/script.sh

after_script:
- ci/print_versions.py
32 changes: 16 additions & 16 deletions ci/before_install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,20 @@ fi

sudo apt-get update $APT_ARGS # run apt-get update for all versions

# hack for broken 3.3 env
if [ x"$VIRTUAL_ENV" == x"" ]; then
VIRTUAL_ENV=~/virtualenv/python$TRAVIS_PYTHON_VERSION_with_system_site_packages;
fi
# # hack for broken 3.3 env
# if [ x"$VIRTUAL_ENV" == x"" ]; then
# VIRTUAL_ENV=~/virtualenv/python$TRAVIS_PYTHON_VERSION_with_system_site_packages;
# fi

# we only recreate the virtualenv for 3.x
# since the "Detach bug" only affects python3
# and travis has numpy preinstalled on 2.x which is quicker
_VENV=$VIRTUAL_ENV # save it
if [ ${TRAVIS_PYTHON_VERSION:0:1} == "3" ] ; then
deactivate # pop out of any venv
sudo pip install virtualenv==1.8.4 --upgrade
sudo apt-get install $APT_ARGS python3.3 python3.3-dev
sudo rm -Rf $_VENV
virtualenv -p python$TRAVIS_PYTHON_VERSION $_VENV --system-site-packages;
source $_VENV/bin/activate
fi
# # we only recreate the virtualenv for 3.x
# # since the "Detach bug" only affects python3
# # and travis has numpy preinstalled on 2.x which is quicker
# _VENV=$VIRTUAL_ENV # save it
# if [ ${TRAVIS_PYTHON_VERSION:0:1} == "3" ] ; then
# deactivate # pop out of any venv
# sudo pip install virtualenv==1.8.4 --upgrade
# sudo apt-get install $APT_ARGS python3.3 python3.3-dev
# sudo rm -Rf $_VENV
# virtualenv -p python$TRAVIS_PYTHON_VERSION $_VENV --system-site-packages;
# source $_VENV/bin/activate
# fi
2 changes: 1 addition & 1 deletion ci/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ fi
if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ] || \
[ ${TRAVIS_PYTHON_VERSION} == "3.1" ] || \
[ ${TRAVIS_PYTHON_VERSION} == "3.2" ]; then
pip $PIP_ARGS install numpy; #https://github.com/y-p/numpy/archive/1.6.2_with_travis_fix.tar.gz;
pip $PIP_ARGS install numpy;
else
pip $PIP_ARGS install https://github.com/numpy/numpy/archive/v1.7.0b2.tar.gz;
fi
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3240,7 +3240,7 @@ def fillna(self, value=None, method=None, axis=0, inplace=False,

Parameters
----------
method : {'backfill', 'bfill', 'pad', 'ffill', None}, default 'pad'
method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
Method to use for filling holes in reindexed Series
pad / ffill: propagate last valid observation forward to next valid
backfill / bfill: use NEXT valid observation to fill gap
Expand Down
30 changes: 30 additions & 0 deletions pandas/io/tests/test_wb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import pandas
from pandas.util.testing import network
from pandas.util.testing import assert_frame_equal
from numpy.testing.decorators import slow
# BUG FIX: this PR adds the module as pandas/io/wb.py, so the import must
# come from pandas.io.wb -- importing pandas.io.wdi would raise ImportError.
from pandas.io.wb import (search, download)

@slow
@network
def test_wdi_search():
    # Reference rows from the World Bank series catalogue that the
    # GDP-per-capita query is expected to match; the integer labels are
    # positions in the full indicator table.
    exp_ids = {2634: u'GDPPCKD',
               4649: u'NY.GDP.PCAP.KD',
               4651: u'NY.GDP.PCAP.KN',
               4653: u'NY.GDP.PCAP.PP.KD'}
    exp_names = {2634: u'GDP per Capita, constant US$, millions',
                 4649: u'GDP per capita (constant 2000 US$)',
                 4651: u'GDP per capita (constant LCU)',
                 4653: u'GDP per capita, PPP (constant 2005 international $)'}
    expected = pandas.DataFrame({u'id': exp_ids, u'name': exp_names})
    # Only the first two columns (id, name) are compared.
    result = search('gdp.*capita.*constant').ix[:, :2]
    assert_frame_equal(result, expected)

@slow
@network
def test_wdi_download():
    # Hard-coded reference values for CA/MX/US over 2003-2005; the bogus
    # 'junk' country and indicator codes must be dropped silently.
    gdppckn = {(u'United States', u'2003'): u'40800.0735367688', (u'Canada', u'2004'): u'37857.1261134552', (u'United States', u'2005'): u'42714.8594790102', (u'Canada', u'2003'): u'37081.4575704003', (u'United States', u'2004'): u'41826.1728310667', (u'Mexico', u'2003'): u'72720.0691255285', (u'Mexico', u'2004'): u'74751.6003347038', (u'Mexico', u'2005'): u'76200.2154469437', (u'Canada', u'2005'): u'38617.4563629611'}
    gdppckd = {(u'United States', u'2003'): u'40800.0735367688', (u'Canada', u'2004'): u'34397.055116118', (u'United States', u'2005'): u'42714.8594790102', (u'Canada', u'2003'): u'33692.2812368928', (u'United States', u'2004'): u'41826.1728310667', (u'Mexico', u'2003'): u'7608.43848670658', (u'Mexico', u'2004'): u'7820.99026814334', (u'Mexico', u'2005'): u'7972.55364129367', (u'Canada', u'2005'): u'35087.8925933298'}
    expected = pandas.DataFrame({'GDPPCKN': gdppckn, 'GDPPCKD': gdppckd})
    result = download(country=['CA', 'MX', 'US', 'junk'],
                      indicator=['GDPPCKD', 'GDPPCKN', 'junk'],
                      start=2003, end=2005)
    # Index ordering from the service is not guaranteed; align before comparing.
    expected.index = result.index
    assert_frame_equal(result, expected)

183 changes: 183 additions & 0 deletions pandas/io/wb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
import urllib2
import warnings
import json
import pandas
import numpy as np

def download(country=('MX', 'CA', 'US'), indicator=('GDPPCKD', 'GDPPCKN'),
             start=2003, end=2005):
    """
    Download data series from the World Bank's World Development Indicators

    Parameters
    ----------

    indicator: string or list of strings
        taken from the ``id`` field in ``WDIsearch()``
    country: string or list of strings.
        ``all`` downloads data for all countries
        ISO-2 character codes select individual countries (e.g.``US``,``CA``)
    start: int
        First year of the data series
    end: int
        Last year of the data series (inclusive)

    Returns
    -------

    ``pandas`` DataFrame with columns: country, iso2c, year, indicator value,
    indexed on (country, year). Returns None if none of the requested
    indicators could be downloaded.
    """
    # ISO-2 codes accepted by the World Bank API ('all' selects every country).
    valid_countries = ["AG", "AL", "AM", "AO", "AR", "AT", "AU", "AZ", "BB",
        "BD", "BE", "BF", "BG", "BH", "BI", "BJ", "BO", "BR", "BS", "BW",
        "BY", "BZ", "CA", "CD", "CF", "CG", "CH", "CI", "CL", "CM", "CN",
        "CO", "CR", "CV", "CY", "CZ", "DE", "DK", "DM", "DO", "DZ", "EC",
        "EE", "EG", "ER", "ES", "ET", "FI", "FJ", "FR", "GA", "GB", "GE",
        "GH", "GM", "GN", "GQ", "GR", "GT", "GW", "GY", "HK", "HN", "HR",
        "HT", "HU", "ID", "IE", "IL", "IN", "IR", "IS", "IT", "JM", "JO",
        "JP", "KE", "KG", "KH", "KM", "KR", "KW", "KZ", "LA", "LB", "LC",
        "LK", "LS", "LT", "LU", "LV", "MA", "MD", "MG", "MK", "ML", "MN",
        "MR", "MU", "MW", "MX", "MY", "MZ", "NA", "NE", "NG", "NI", "NL",
        "NO", "NP", "NZ", "OM", "PA", "PE", "PG", "PH", "PK", "PL", "PT",
        "PY", "RO", "RU", "RW", "SA", "SB", "SC", "SD", "SE", "SG", "SI",
        "SK", "SL", "SN", "SR", "SV", "SY", "SZ", "TD", "TG", "TH", "TN",
        "TR", "TT", "TW", "TZ", "UA", "UG", "US", "UY", "UZ", "VC", "VE",
        "VN", "VU", "YE", "ZA", "ZM", "ZW", "all"]
    # Accept a bare string for a single country.
    if isinstance(country, str):
        country = [country]
    # Partition the requested codes; the API call uses only the valid ones.
    bad_countries = np.setdiff1d(country, valid_countries)
    country = np.intersect1d(country, valid_countries)
    country = ';'.join(country)
    # Work with a list of indicators.
    if isinstance(indicator, str):
        indicator = [indicator]
    # Download one series at a time; collect failures instead of aborting so
    # a single bad code does not lose the rest of the request (best effort).
    data = []
    bad_indicators = []
    for ind in indicator:
        try:
            tmp = _get_data(ind, country, start, end)
            tmp.columns = ['country', 'iso2c', 'year', ind]
            data.append(tmp)
        # Narrowed from a bare except: so SystemExit/KeyboardInterrupt
        # still propagate instead of being recorded as a "bad indicator".
        except Exception:
            bad_indicators.append(ind)
    # Report anything we had to drop via the warnings machinery (the module
    # already imports warnings) rather than printing to stdout.
    if len(bad_indicators) > 0:
        warnings.warn('Failed to obtain indicator(s): ' +
                      '; '.join(bad_indicators) +
                      '. The data may still be available for download at '
                      'http://data.worldbank.org')
    if len(bad_countries) > 0:
        warnings.warn('Invalid ISO-2 codes: ' + ' '.join(bad_countries))
    # Merge the individual WDI series on their shared (country, iso2c, year)
    # columns, then index on (country, year).
    if len(data) > 0:
        out = reduce(lambda x, y: x.merge(y, how='outer'), data)
        out = out.drop('iso2c', axis=1)
        out = out.set_index(['country', 'year'])
        return out


def _get_data(indicator = "NY.GNS.ICTR.GN.ZS", country = 'US',
              start = 2002, end = 2005):
    """Fetch one indicator for a set of countries as a raw 4-column frame."""
    # Assemble the REST query: one indicator, semicolon-separated country
    # codes, inclusive year range, single JSON page large enough for any reply.
    url = ("http://api.worldbank.org/countries/" + country + "/indicators/" +
           indicator + "?date=" + str(start) + ":" + str(end) +
           "&per_page=25000" + "&format=json")
    # Fetch and parse; element [1] of the JSON payload holds the observations.
    raw = urllib2.urlopen(url).read()
    records = json.loads(raw)[1]
    country_names = [rec['country']['value'] for rec in records]
    iso_codes = [rec['country']['id'] for rec in records]
    years = [rec['date'] for rec in records]
    values = [rec['value'] for rec in records]
    # One row per observation: country, iso2c, year, value.
    return pandas.DataFrame([country_names, iso_codes, years, values]).T


def get_countries():
    '''Query information about countries
    '''
    url = 'http://api.worldbank.org/countries/all?format=json'
    raw = urllib2.urlopen(url).read()
    table = pandas.DataFrame(json.loads(raw)[1])
    # The API nests these fields as {'id': ..., 'value': ...} dicts;
    # keep only the human-readable value for each.
    for col in ['adminregion', 'incomeLevel', 'lendingType', 'region']:
        table[col] = [entry['value'] for entry in table[col]]
    # Normalise the code columns to the names used elsewhere in this module.
    return table.rename(columns={'id': 'iso3c', 'iso2Code': 'iso2c'})


def get_indicators():
    '''Download information about all World Bank data series
    '''
    url = 'http://api.worldbank.org/indicators?per_page=50000&format=json'
    raw = urllib2.urlopen(url).read()
    table = pandas.DataFrame(json.loads(raw)[1])
    # 'source' is nested as {'id': ..., 'value': ...}; keep the value only.
    table.source = [entry['value'] for entry in table.source]
    # Strip non-ASCII characters from the organization names.
    table.sourceOrganization = table.sourceOrganization.apply(
        lambda s: s.encode('ascii', 'ignore'))

    # 'topics' is a list of {'value': ...} dicts, but some entries are
    # malformed -- fall back to the empty string for those.
    def _topic_name(topic):
        try:
            return topic['value']
        except Exception:
            return ''
    table.topics = table.topics.apply(
        lambda topics: ' ; '.join(_topic_name(t) for t in topics))
    # Clean output: sort the catalogue by indicator id and renumber the rows.
    table = table.sort(columns='id')
    table.index = pandas.Index(range(table.shape[0]))
    return table


# Lazily-populated cache of the full indicator table (see search()).
_cached_series = None
def search(string='gdp.*capi', field='name', case=False):
    """
    Search available data series from the world bank

    Parameters
    ----------

    string: string
        regular expression
    field: string
        id, name, source, sourceNote, sourceOrganization, topics
        See notes below
    case: bool
        case sensitive search?

    Notes
    -----

    The first time this function is run it will download and cache the full
    list of available series. Depending on the speed of your network
    connection, this can take time. Subsequent searches will use the cached
    copy, so they should be much faster.

    id : Data series indicator (for use with the ``indicator`` argument of
       ``download()``) e.g. NY.GNS.ICTR.GN.ZS"
    name: Short description of the data series
    source: Data collection project
    sourceOrganization: Data collection organization
    note:
    sourceNote:
    topics:
    """
    # Populate the cache on first use (one network round-trip). isinstance
    # replaces the original exact-type comparison against
    # pandas.core.frame.DataFrame, which would wrongly re-download for
    # DataFrame subclasses.
    global _cached_series
    if not isinstance(_cached_series, pandas.DataFrame):
        _cached_series = get_indicators()
    data = _cached_series[field]
    # Boolean mask of rows whose `field` matches the regular expression.
    idx = data.str.contains(string, case=case)
    out = _cached_series.ix[idx].dropna()
    return out