
Commit

BUG: Misc fixes for SparseSeries indexing with MI
closes #13144

Author: sinhrks <sinhrks@gmail.com>

Closes #13163 from sinhrks/sparse_multi and squashes the following commits:

eb24102 [sinhrks] BUG: Misc fixes for SparseSeries indexing with MI
sinhrks authored and jreback committed May 13, 2016
1 parent e5c18b4 commit 00d4ec3
Showing 7 changed files with 214 additions and 43 deletions.
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v0.18.2.txt
@@ -106,6 +106,9 @@ Performance Improvements

Bug Fixes
~~~~~~~~~

- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`)
- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`)
- Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`)
- Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`)

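The two ``SparseSeries`` entries above are what this commit addresses. A minimal sketch of the fixed behavior, mirroring the tests added below and assuming the 0.18-era sparse API (``to_sparse`` and ``SparseSeries`` were removed in later pandas releases):

import numpy as np
import pandas as pd

# MultiIndex with repeated first-level labels, as in the new tests
idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0),
                                 ('C', 0), ('C', 1)])
orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=idx)
sparse = orig.to_sparse()

# These lookups could previously raise IndexError; they now match the
# dense result.
print(sparse['A'])        # partial label on the first level
print(sparse['C', 0])     # full tuple label -> scalar 3.0

# Boolean [] indexing now keeps the MultiIndex instead of returning
# a flat Index of tuples.
print(type(sparse[orig % 2 == 1].index))   # MultiIndex
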
4 changes: 2 additions & 2 deletions pandas/indexes/multi.py
@@ -592,7 +592,6 @@ def fillna(self, value=None, downcast=None):
def get_value(self, series, key):
# somewhat broken encapsulation
from pandas.core.indexing import maybe_droplevels
from pandas.core.series import Series

# Label-based
s = _values_from_object(series)
@@ -604,7 +603,8 @@ def _try_mi(k):
new_values = series._values[loc]
new_index = self[loc]
new_index = maybe_droplevels(new_index, k)
return Series(new_values, index=new_index, name=series.name)
return series._constructor(new_values, index=new_index,
name=series.name).__finalize__(self)

try:
return self._engine.get_value(s, k)
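The get_value change above builds the sliced result with series._constructor instead of a hard-coded Series, so a SparseSeries passed in gets a SparseSeries back. A small illustration of the _constructor pattern outside pandas internals, using a hypothetical subclass MySeries:

import pandas as pd

class MySeries(pd.Series):
    """Hypothetical subclass, used only to show why _constructor matters."""

    @property
    def _constructor(self):
        # pandas calls this whenever it needs to build a new object
        # "like" self, so derived results keep the subclass type.
        return MySeries

s = MySeries([1, 2, 3], index=list('abc'))
print(type(s[s > 1]))   # MySeries, not plain pandas.Series
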
20 changes: 10 additions & 10 deletions pandas/sparse/series.py
@@ -5,14 +5,13 @@

# pylint: disable=E1101,E1103,W0231

from numpy import nan, ndarray
import numpy as np
import warnings
import operator

from pandas.compat.numpy import function as nv
from pandas.core.common import isnull, _values_from_object, _maybe_match_name
from pandas.core.index import Index, _ensure_index
from pandas.core.index import Index, _ensure_index, InvalidIndexError
from pandas.core.series import Series
from pandas.core.frame import DataFrame
from pandas.core.internals import SingleBlockManager
@@ -135,7 +134,7 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block',
if is_sparse_array:
fill_value = data.fill_value
else:
fill_value = nan
fill_value = np.nan

if is_sparse_array:
if isinstance(data, SparseSeries) and index is None:
@@ -393,8 +392,10 @@ def _get_val_at(self, loc):

def __getitem__(self, key):
try:
return self._get_val_at(self.index.get_loc(key))
return self.index.get_value(self, key)

except InvalidIndexError:
pass
except KeyError:
if isinstance(key, (int, np.integer)):
return self._get_val_at(key)
@@ -406,13 +407,12 @@ def __getitem__(self, key):
# Could not hash item, must be array-like?
pass

# is there a case where this would NOT be an ndarray?
# need to find an example, I took out the case for now

key = _values_from_object(key)
dataSlice = self.values[key]
new_index = Index(self.index.view(ndarray)[key])
return self._constructor(dataSlice, index=new_index).__finalize__(self)
if self.index.nlevels > 1 and isinstance(key, tuple):
# to handle MultiIndex labels
key = self.index.get_loc(key)
return self._constructor(self.values[key],
index=self.index[key]).__finalize__(self)

def _get_values(self, indexer):
try:
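In the rewritten __getitem__ above, label lookup is delegated to index.get_value, InvalidIndexError falls through, and the array-like fallback now resolves tuple keys on a MultiIndex via get_loc and reuses self.index[key] instead of rebuilding a flat Index. A rough usage sketch, again assuming the 0.18-era sparse API:

import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0)])
sparse = pd.Series([1, np.nan, 3], index=idx).to_sparse()

# Full tuple labels are routed through index.get_value and return scalars.
assert sparse['B', 0] == 3
assert np.isnan(sparse['A', 1])

# Array-like keys slice the index itself, so the result keeps the
# MultiIndex instead of degrading to a flat Index of tuples.
result = sparse[[0, 2]]
assert isinstance(result.index, pd.MultiIndex)
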
60 changes: 60 additions & 0 deletions pandas/sparse/tests/test_format.py
@@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-
from __future__ import print_function

import numpy as np
import pandas as pd

import pandas.util.testing as tm
from pandas.compat import (is_platform_windows,
is_platform_32bit)
from pandas.core.config import option_context


use_32bit_repr = is_platform_windows() or is_platform_32bit()


class TestSeriesFormatting(tm.TestCase):

_multiprocess_can_split_ = True

def test_sparse_max_row(self):
s = pd.Series([1, np.nan, np.nan, 3, np.nan]).to_sparse()
result = repr(s)
dtype = '' if use_32bit_repr else ', dtype=int32'
exp = ("0 1.0\n1 NaN\n2 NaN\n3 3.0\n"
"4 NaN\ndtype: float64\nBlockIndex\n"
"Block locations: array([0, 3]{0})\n"
"Block lengths: array([1, 1]{0})".format(dtype))
self.assertEqual(result, exp)

with option_context("display.max_rows", 3):
# GH 10560
result = repr(s)
exp = ("0 1.0\n ... \n4 NaN\n"
"dtype: float64\nBlockIndex\n"
"Block locations: array([0, 3]{0})\n"
"Block lengths: array([1, 1]{0})".format(dtype))
self.assertEqual(result, exp)

def test_sparse_mi_max_row(self):
idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0),
('C', 0), ('C', 1), ('C', 2)])
s = pd.Series([1, np.nan, np.nan, 3, np.nan, np.nan],
index=idx).to_sparse()
result = repr(s)
dtype = '' if use_32bit_repr else ', dtype=int32'
exp = ("A 0 1.0\n 1 NaN\nB 0 NaN\n"
"C 0 3.0\n 1 NaN\n 2 NaN\n"
"dtype: float64\nBlockIndex\n"
"Block locations: array([0, 3], dtype=int32)\n"
"Block lengths: array([1, 1]{0})".format(dtype))
self.assertEqual(result, exp)

with option_context("display.max_rows", 3):
# GH 13144
result = repr(s)
exp = ("A 0 1.0\n ... \nC 2 NaN\n"
"dtype: float64\nBlockIndex\n"
"Block locations: array([0, 3], dtype=int32)\n"
"Block lengths: array([1, 1]{0})".format(dtype))
self.assertEqual(result, exp)
142 changes: 130 additions & 12 deletions pandas/sparse/tests/test_indexing.py
@@ -10,9 +10,13 @@ class TestSparseSeriesIndexing(tm.TestCase):

_multiprocess_can_split_ = True

def setUp(self):
self.orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
self.sparse = self.orig.to_sparse()

def test_getitem(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()
orig = self.orig
sparse = self.sparse

self.assertEqual(sparse[0], 1)
self.assertTrue(np.isnan(sparse[1]))
@@ -33,8 +37,9 @@ def test_getitem(self):
tm.assert_sp_series_equal(result, exp)

def test_getitem_slice(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()
orig = self.orig
sparse = self.sparse

tm.assert_sp_series_equal(sparse[:2], orig[:2].to_sparse())
tm.assert_sp_series_equal(sparse[4:2], orig[4:2].to_sparse())
tm.assert_sp_series_equal(sparse[::2], orig[::2].to_sparse())
@@ -84,8 +89,8 @@ def test_getitem_slice_fill_value(self):
orig[-5:].to_sparse(fill_value=0))

def test_loc(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()
orig = self.orig
sparse = self.sparse

self.assertEqual(sparse.loc[0], 1)
self.assertTrue(np.isnan(sparse.loc[1]))
@@ -154,19 +159,26 @@ def test_loc_index_fill_value(self):
tm.assert_sp_series_equal(result, exp)

def test_loc_slice(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()
orig = self.orig
sparse = self.sparse
tm.assert_sp_series_equal(sparse.loc[2:], orig.loc[2:].to_sparse())

def test_loc_slice_index_fill_value(self):
orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE'))
sparse = orig.to_sparse(fill_value=0)

tm.assert_sp_series_equal(sparse.loc['C':],
orig.loc['C':].to_sparse(fill_value=0))

def test_loc_slice_fill_value(self):
orig = pd.Series([1, np.nan, 0, 3, 0])
sparse = orig.to_sparse(fill_value=0)
tm.assert_sp_series_equal(sparse.loc[2:],
orig.loc[2:].to_sparse(fill_value=0))

def test_iloc(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()
orig = self.orig
sparse = self.sparse

self.assertEqual(sparse.iloc[3], 3)
self.assertTrue(np.isnan(sparse.iloc[2]))
@@ -234,8 +246,9 @@ def test_at_fill_value(self):
self.assertEqual(sparse.at['e'], orig.at['e'])

def test_iat(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()
orig = self.orig
sparse = self.sparse

self.assertEqual(sparse.iat[0], orig.iat[0])
self.assertTrue(np.isnan(sparse.iat[1]))
self.assertTrue(np.isnan(sparse.iat[2]))
@@ -356,6 +369,111 @@ def test_reindex_fill_value(self):
tm.assert_sp_series_equal(res, exp)


class TestSparseSeriesMultiIndexing(TestSparseSeriesIndexing):

_multiprocess_can_split_ = True

def setUp(self):
# MultiIndex with duplicated first-level values
idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0),
('C', 0), ('C', 1)])
self.orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=idx)
self.sparse = self.orig.to_sparse()

def test_getitem_multi(self):
orig = self.orig
sparse = self.sparse

self.assertEqual(sparse[0], orig[0])
self.assertTrue(np.isnan(sparse[1]))
self.assertEqual(sparse[3], orig[3])

tm.assert_sp_series_equal(sparse['A'], orig['A'].to_sparse())
tm.assert_sp_series_equal(sparse['B'], orig['B'].to_sparse())

result = sparse[[1, 3, 4]]
exp = orig[[1, 3, 4]].to_sparse()
tm.assert_sp_series_equal(result, exp)

# dense array
result = sparse[orig % 2 == 1]
exp = orig[orig % 2 == 1].to_sparse()
tm.assert_sp_series_equal(result, exp)

# sparse array (actually it coerces to a normal Series)
result = sparse[sparse % 2 == 1]
exp = orig[orig % 2 == 1].to_sparse()
tm.assert_sp_series_equal(result, exp)

def test_getitem_multi_tuple(self):
orig = self.orig
sparse = self.sparse

self.assertEqual(sparse['C', 0], orig['C', 0])
self.assertTrue(np.isnan(sparse['A', 1]))
self.assertTrue(np.isnan(sparse['B', 0]))

def test_getitems_slice_multi(self):
orig = self.orig
sparse = self.sparse

tm.assert_sp_series_equal(sparse[2:], orig[2:].to_sparse())
tm.assert_sp_series_equal(sparse.loc['B':], orig.loc['B':].to_sparse())
tm.assert_sp_series_equal(sparse.loc['C':], orig.loc['C':].to_sparse())

tm.assert_sp_series_equal(sparse.loc['A':'B'],
orig.loc['A':'B'].to_sparse())
tm.assert_sp_series_equal(sparse.loc[:'B'], orig.loc[:'B'].to_sparse())

def test_loc(self):
# needs to be overridden to use different labels
orig = self.orig
sparse = self.sparse

tm.assert_sp_series_equal(sparse.loc['A'],
orig.loc['A'].to_sparse())
tm.assert_sp_series_equal(sparse.loc['B'],
orig.loc['B'].to_sparse())

result = sparse.loc[[1, 3, 4]]
exp = orig.loc[[1, 3, 4]].to_sparse()
tm.assert_sp_series_equal(result, exp)

# exceeds the bounds
result = sparse.loc[[1, 3, 4, 5]]
exp = orig.loc[[1, 3, 4, 5]].to_sparse()
tm.assert_sp_series_equal(result, exp)

# dense array
result = sparse.loc[orig % 2 == 1]
exp = orig.loc[orig % 2 == 1].to_sparse()
tm.assert_sp_series_equal(result, exp)

# sparse array (actually it coerces to a normal Series)
result = sparse.loc[sparse % 2 == 1]
exp = orig.loc[orig % 2 == 1].to_sparse()
tm.assert_sp_series_equal(result, exp)

def test_loc_multi_tuple(self):
orig = self.orig
sparse = self.sparse

self.assertEqual(sparse.loc['C', 0], orig.loc['C', 0])
self.assertTrue(np.isnan(sparse.loc['A', 1]))
self.assertTrue(np.isnan(sparse.loc['B', 0]))

def test_loc_slice(self):
orig = self.orig
sparse = self.sparse
tm.assert_sp_series_equal(sparse.loc['A':], orig.loc['A':].to_sparse())
tm.assert_sp_series_equal(sparse.loc['B':], orig.loc['B':].to_sparse())
tm.assert_sp_series_equal(sparse.loc['C':], orig.loc['C':].to_sparse())

tm.assert_sp_series_equal(sparse.loc['A':'B'],
orig.loc['A':'B'].to_sparse())
tm.assert_sp_series_equal(sparse.loc[:'B'], orig.loc[:'B'].to_sparse())


class TestSparseDataFrameIndexing(tm.TestCase):

_multiprocess_can_split_ = True
9 changes: 9 additions & 0 deletions pandas/sparse/tests/test_series.py
@@ -1019,6 +1019,15 @@ def test_from_coo_nodense_index(self):
check = check.dropna().to_sparse()
tm.assert_sp_series_equal(ss, check)

def test_from_coo_long_repr(self):
# GH 13114
# test that it doesn't raise an error; formatting is tested in test_format
tm._skip_if_no_scipy()
import scipy.sparse

sparse = SparseSeries.from_coo(scipy.sparse.rand(350, 18))
repr(sparse)

def _run_test(self, ss, kwargs, check):
results = ss.to_coo(**kwargs)
self._check_results_to_coo(results, check)
19 changes: 0 additions & 19 deletions pandas/tests/formats/test_format.py
@@ -3758,25 +3758,6 @@ def test_to_string_header(self):
exp = '0 0\n ..\n9 9'
self.assertEqual(res, exp)

def test_sparse_max_row(self):
s = pd.Series([1, np.nan, np.nan, 3, np.nan]).to_sparse()
result = repr(s)
dtype = '' if use_32bit_repr else ', dtype=int32'
exp = ("0 1.0\n1 NaN\n2 NaN\n3 3.0\n"
"4 NaN\ndtype: float64\nBlockIndex\n"
"Block locations: array([0, 3]{0})\n"
"Block lengths: array([1, 1]{0})".format(dtype))
self.assertEqual(result, exp)

with option_context("display.max_rows", 3):
# GH 10560
result = repr(s)
exp = ("0 1.0\n ... \n4 NaN\n"
"dtype: float64\nBlockIndex\n"
"Block locations: array([0, 3]{0})\n"
"Block lengths: array([1, 1]{0})".format(dtype))
self.assertEqual(result, exp)


class TestEngFormatter(tm.TestCase):
_multiprocess_can_split_ = True
