
Commit

BUG: Misc fixes for SparseSeries indexing with MI
closes #13144

Author: sinhrks <sinhrks@gmail.com>

Closes #13163 from sinhrks/sparse_multi and squashes the following commits:

eb24102 [sinhrks] BUG: Misc fixes for SparseSeries indexing with MI
sinhrks authored and jreback committed May 13, 2016
1 parent e5c18b4 commit 00d4ec3
Showing 7 changed files with 214 additions and 43 deletions.
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v0.18.2.txt
@@ -106,6 +106,9 @@ Performance Improvements

Bug Fixes
~~~~~~~~~

- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`)
- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`)
- Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`)
- Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`)

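The two ``SparseSeries`` entries above are what this commit addresses. A minimal sketch of the fixed behavior, mirroring the tests added below and assuming the 0.18-era sparse API (``to_sparse`` and ``SparseSeries`` were removed in later pandas releases):

import numpy as np
import pandas as pd

# MultiIndex with repeated first-level labels, as in the new tests
idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0),
                                 ('C', 0), ('C', 1)])
orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=idx)
sparse = orig.to_sparse()

# These lookups could previously raise IndexError; they now match the
# dense result.
print(sparse['A'])        # partial label on the first level
print(sparse['C', 0])     # full tuple label -> scalar 3.0

# Boolean [] indexing now keeps the MultiIndex instead of returning
# a flat Index of tuples.
print(type(sparse[orig % 2 == 1].index))   # MultiIndex
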
4 changes: 2 additions & 2 deletions pandas/indexes/multi.py
@@ -592,7 +592,6 @@ def fillna(self, value=None, downcast=None):
def get_value(self, series, key):
# somewhat broken encapsulation
from pandas.core.indexing import maybe_droplevels
from pandas.core.series import Series

# Label-based
s = _values_from_object(series)
@@ -604,7 +603,8 @@ def _try_mi(k):
new_values = series._values[loc]
new_index = self[loc]
new_index = maybe_droplevels(new_index, k)
return Series(new_values, index=new_index, name=series.name)
return series._constructor(new_values, index=new_index,
name=series.name).__finalize__(self)

try:
return self._engine.get_value(s, k)
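The get_value change above builds the sliced result with series._constructor instead of a hard-coded Series, so a SparseSeries passed in gets a SparseSeries back. A small illustration of the _constructor pattern outside pandas internals, using a hypothetical subclass MySeries:

import pandas as pd

class MySeries(pd.Series):
    """Hypothetical subclass, used only to show why _constructor matters."""

    @property
    def _constructor(self):
        # pandas calls this whenever it needs to build a new object
        # "like" self, so derived results keep the subclass type.
        return MySeries

s = MySeries([1, 2, 3], index=list('abc'))
print(type(s[s > 1]))   # MySeries, not plain pandas.Series
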
20 changes: 10 additions & 10 deletions pandas/sparse/series.py
@@ -5,14 +5,13 @@

# pylint: disable=E1101,E1103,W0231

from numpy import nan, ndarray
import numpy as np
import warnings
import operator

from pandas.compat.numpy import function as nv
from pandas.core.common import isnull, _values_from_object, _maybe_match_name
from pandas.core.index import Index, _ensure_index
from pandas.core.index import Index, _ensure_index, InvalidIndexError
from pandas.core.series import Series
from pandas.core.frame import DataFrame
from pandas.core.internals import SingleBlockManager
@@ -135,7 +134,7 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block',
if is_sparse_array:
fill_value = data.fill_value
else:
fill_value = nan
fill_value = np.nan

if is_sparse_array:
if isinstance(data, SparseSeries) and index is None:
@@ -393,8 +392,10 @@ def _get_val_at(self, loc):

def __getitem__(self, key):
try:
return self._get_val_at(self.index.get_loc(key))
return self.index.get_value(self, key)

except InvalidIndexError:
pass
except KeyError:
if isinstance(key, (int, np.integer)):
return self._get_val_at(key)
@@ -406,13 +407,12 @@ def __getitem__(self, key):
# Could not hash item, must be array-like?
pass

# is there a case where this would NOT be an ndarray?
# need to find an example, I took out the case for now

key = _values_from_object(key)
dataSlice = self.values[key]
new_index = Index(self.index.view(ndarray)[key])
return self._constructor(dataSlice, index=new_index).__finalize__(self)
if self.index.nlevels > 1 and isinstance(key, tuple):
# to handle MultiIndex labels
key = self.index.get_loc(key)
return self._constructor(self.values[key],
index=self.index[key]).__finalize__(self)

def _get_values(self, indexer):
try:
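In the rewritten __getitem__ above, label lookup is delegated to index.get_value, InvalidIndexError falls through, and the array-like fallback now resolves tuple keys on a MultiIndex via get_loc and reuses self.index[key] instead of rebuilding a flat Index. A rough usage sketch, again assuming the 0.18-era sparse API:

import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0)])
sparse = pd.Series([1, np.nan, 3], index=idx).to_sparse()

# Full tuple labels are routed through index.get_value and return scalars.
assert sparse['B', 0] == 3
assert np.isnan(sparse['A', 1])

# Array-like keys slice the index itself, so the result keeps the
# MultiIndex instead of degrading to a flat Index of tuples.
result = sparse[[0, 2]]
assert isinstance(result.index, pd.MultiIndex)
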
60 changes: 60 additions & 0 deletions pandas/sparse/tests/test_format.py
@@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-
from __future__ import print_function

import numpy as np
import pandas as pd

import pandas.util.testing as tm
from pandas.compat import (is_platform_windows,
is_platform_32bit)
from pandas.core.config import option_context


use_32bit_repr = is_platform_windows() or is_platform_32bit()


class TestSeriesFormatting(tm.TestCase):

_multiprocess_can_split_ = True

def test_sparse_max_row(self):
s = pd.Series([1, np.nan, np.nan, 3, np.nan]).to_sparse()
result = repr(s)
dtype = '' if use_32bit_repr else ', dtype=int32'
exp = ("0 1.0\n1 NaN\n2 NaN\n3 3.0\n"
"4 NaN\ndtype: float64\nBlockIndex\n"
"Block locations: array([0, 3]{0})\n"
"Block lengths: array([1, 1]{0})".format(dtype))
self.assertEqual(result, exp)

with option_context("display.max_rows", 3):
# GH 10560
result = repr(s)
exp = ("0 1.0\n ... \n4 NaN\n"
"dtype: float64\nBlockIndex\n"
"Block locations: array([0, 3]{0})\n"
"Block lengths: array([1, 1]{0})".format(dtype))
self.assertEqual(result, exp)

def test_sparse_mi_max_row(self):
idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0),
('C', 0), ('C', 1), ('C', 2)])
s = pd.Series([1, np.nan, np.nan, 3, np.nan, np.nan],
index=idx).to_sparse()
result = repr(s)
dtype = '' if use_32bit_repr else ', dtype=int32'
exp = ("A 0 1.0\n 1 NaN\nB 0 NaN\n"
"C 0 3.0\n 1 NaN\n 2 NaN\n"
"dtype: float64\nBlockIndex\n"
"Block locations: array([0, 3], dtype=int32)\n"
"Block lengths: array([1, 1]{0})".format(dtype))
self.assertEqual(result, exp)

with option_context("display.max_rows", 3):
# GH 13144
result = repr(s)
exp = ("A 0 1.0\n ... \nC 2 NaN\n"
"dtype: float64\nBlockIndex\n"
"Block locations: array([0, 3], dtype=int32)\n"
"Block lengths: array([1, 1]{0})".format(dtype))
self.assertEqual(result, exp)
142 changes: 130 additions & 12 deletions pandas/sparse/tests/test_indexing.py
@@ -10,9 +10,13 @@ class TestSparseSeriesIndexing(tm.TestCase):

_multiprocess_can_split_ = True

def setUp(self):
self.orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
self.sparse = self.orig.to_sparse()

def test_getitem(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()
orig = self.orig
sparse = self.sparse

self.assertEqual(sparse[0], 1)
self.assertTrue(np.isnan(sparse[1]))
@@ -33,8 +37,9 @@ def test_getitem(self):
tm.assert_sp_series_equal(result, exp)

def test_getitem_slice(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()
orig = self.orig
sparse = self.sparse

tm.assert_sp_series_equal(sparse[:2], orig[:2].to_sparse())
tm.assert_sp_series_equal(sparse[4:2], orig[4:2].to_sparse())
tm.assert_sp_series_equal(sparse[::2], orig[::2].to_sparse())
@@ -84,8 +89,8 @@ def test_getitem_slice_fill_value(self):
orig[-5:].to_sparse(fill_value=0))

def test_loc(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()
orig = self.orig
sparse = self.sparse

self.assertEqual(sparse.loc[0], 1)
self.assertTrue(np.isnan(sparse.loc[1]))
@@ -154,19 +159,26 @@ def test_loc_index_fill_value(self):
tm.assert_sp_series_equal(result, exp)

def test_loc_slice(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()
orig = self.orig
sparse = self.sparse
tm.assert_sp_series_equal(sparse.loc[2:], orig.loc[2:].to_sparse())

def test_loc_slice_index_fill_value(self):
orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE'))
sparse = orig.to_sparse(fill_value=0)

tm.assert_sp_series_equal(sparse.loc['C':],
orig.loc['C':].to_sparse(fill_value=0))

def test_loc_slice_fill_value(self):
orig = pd.Series([1, np.nan, 0, 3, 0])
sparse = orig.to_sparse(fill_value=0)
tm.assert_sp_series_equal(sparse.loc[2:],
orig.loc[2:].to_sparse(fill_value=0))

def test_iloc(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()
orig = self.orig
sparse = self.sparse

self.assertEqual(sparse.iloc[3], 3)
self.assertTrue(np.isnan(sparse.iloc[2]))
@@ -234,8 +246,9 @@ def test_at_fill_value(self):
self.assertEqual(sparse.at['e'], orig.at['e'])

def test_iat(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()
orig = self.orig
sparse = self.sparse

self.assertEqual(sparse.iat[0], orig.iat[0])
self.assertTrue(np.isnan(sparse.iat[1]))
self.assertTrue(np.isnan(sparse.iat[2]))
@@ -356,6 +369,111 @@ def test_reindex_fill_value(self):
tm.assert_sp_series_equal(res, exp)


class TestSparseSeriesMultiIndexing(TestSparseSeriesIndexing):

_multiprocess_can_split_ = True

def setUp(self):
# MultiIndex with duplicated first-level values
idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0),
('C', 0), ('C', 1)])
self.orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=idx)
self.sparse = self.orig.to_sparse()

def test_getitem_multi(self):
orig = self.orig
sparse = self.sparse

self.assertEqual(sparse[0], orig[0])
self.assertTrue(np.isnan(sparse[1]))
self.assertEqual(sparse[3], orig[3])

tm.assert_sp_series_equal(sparse['A'], orig['A'].to_sparse())
tm.assert_sp_series_equal(sparse['B'], orig['B'].to_sparse())

result = sparse[[1, 3, 4]]
exp = orig[[1, 3, 4]].to_sparse()
tm.assert_sp_series_equal(result, exp)

# dense array
result = sparse[orig % 2 == 1]
exp = orig[orig % 2 == 1].to_sparse()
tm.assert_sp_series_equal(result, exp)

# sparse array (actually it coerces to a normal Series)
result = sparse[sparse % 2 == 1]
exp = orig[orig % 2 == 1].to_sparse()
tm.assert_sp_series_equal(result, exp)

def test_getitem_multi_tuple(self):
orig = self.orig
sparse = self.sparse

self.assertEqual(sparse['C', 0], orig['C', 0])
self.assertTrue(np.isnan(sparse['A', 1]))
self.assertTrue(np.isnan(sparse['B', 0]))

def test_getitems_slice_multi(self):
orig = self.orig
sparse = self.sparse

tm.assert_sp_series_equal(sparse[2:], orig[2:].to_sparse())
tm.assert_sp_series_equal(sparse.loc['B':], orig.loc['B':].to_sparse())
tm.assert_sp_series_equal(sparse.loc['C':], orig.loc['C':].to_sparse())

tm.assert_sp_series_equal(sparse.loc['A':'B'],
orig.loc['A':'B'].to_sparse())
tm.assert_sp_series_equal(sparse.loc[:'B'], orig.loc[:'B'].to_sparse())

def test_loc(self):
# needs to be overridden to use different labels
orig = self.orig
sparse = self.sparse

tm.assert_sp_series_equal(sparse.loc['A'],
orig.loc['A'].to_sparse())
tm.assert_sp_series_equal(sparse.loc['B'],
orig.loc['B'].to_sparse())

result = sparse.loc[[1, 3, 4]]
exp = orig.loc[[1, 3, 4]].to_sparse()
tm.assert_sp_series_equal(result, exp)

# exceeds the bounds
result = sparse.loc[[1, 3, 4, 5]]
exp = orig.loc[[1, 3, 4, 5]].to_sparse()
tm.assert_sp_series_equal(result, exp)

# dense array
result = sparse.loc[orig % 2 == 1]
exp = orig.loc[orig % 2 == 1].to_sparse()
tm.assert_sp_series_equal(result, exp)

# sparse array (actually it coerces to a normal Series)
result = sparse.loc[sparse % 2 == 1]
exp = orig.loc[orig % 2 == 1].to_sparse()
tm.assert_sp_series_equal(result, exp)

def test_loc_multi_tuple(self):
orig = self.orig
sparse = self.sparse

self.assertEqual(sparse.loc['C', 0], orig.loc['C', 0])
self.assertTrue(np.isnan(sparse.loc['A', 1]))
self.assertTrue(np.isnan(sparse.loc['B', 0]))

def test_loc_slice(self):
orig = self.orig
sparse = self.sparse
tm.assert_sp_series_equal(sparse.loc['A':], orig.loc['A':].to_sparse())
tm.assert_sp_series_equal(sparse.loc['B':], orig.loc['B':].to_sparse())
tm.assert_sp_series_equal(sparse.loc['C':], orig.loc['C':].to_sparse())

tm.assert_sp_series_equal(sparse.loc['A':'B'],
orig.loc['A':'B'].to_sparse())
tm.assert_sp_series_equal(sparse.loc[:'B'], orig.loc[:'B'].to_sparse())


class TestSparseDataFrameIndexing(tm.TestCase):

_multiprocess_can_split_ = True
9 changes: 9 additions & 0 deletions pandas/sparse/tests/test_series.py
@@ -1019,6 +1019,15 @@ def test_from_coo_nodense_index(self):
check = check.dropna().to_sparse()
tm.assert_sp_series_equal(ss, check)

def test_from_coo_long_repr(self):
# GH 13114
# test that it doesn't raise an error; formatting is tested in test_format
tm._skip_if_no_scipy()
import scipy.sparse

sparse = SparseSeries.from_coo(scipy.sparse.rand(350, 18))
repr(sparse)

def _run_test(self, ss, kwargs, check):
results = ss.to_coo(**kwargs)
self._check_results_to_coo(results, check)
19 changes: 0 additions & 19 deletions pandas/tests/formats/test_format.py
@@ -3758,25 +3758,6 @@ def test_to_string_header(self):
exp = '0 0\n ..\n9 9'
self.assertEqual(res, exp)

def test_sparse_max_row(self):
s = pd.Series([1, np.nan, np.nan, 3, np.nan]).to_sparse()
result = repr(s)
dtype = '' if use_32bit_repr else ', dtype=int32'
exp = ("0 1.0\n1 NaN\n2 NaN\n3 3.0\n"
"4 NaN\ndtype: float64\nBlockIndex\n"
"Block locations: array([0, 3]{0})\n"
"Block lengths: array([1, 1]{0})".format(dtype))
self.assertEqual(result, exp)

with option_context("display.max_rows", 3):
# GH 10560
result = repr(s)
exp = ("0 1.0\n ... \n4 NaN\n"
"dtype: float64\nBlockIndex\n"
"Block locations: array([0, 3]{0})\n"
"Block lengths: array([1, 1]{0})".format(dtype))
self.assertEqual(result, exp)


class TestEngFormatter(tm.TestCase):
_multiprocess_can_split_ = True
