BUG: Misc fixes for SparseSeries indexing with MI #13163

Closed · wants to merge 1 commit
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v0.18.2.txt
@@ -107,6 +107,9 @@ Performance Improvements

Bug Fixes
~~~~~~~~~

- Bug in ``SparseSeries`` where ``[]`` indexing with a ``MultiIndex`` could raise ``IndexError`` (:issue:`13144`)
- Bug in ``SparseSeries`` where the result of ``[]`` indexing with a ``MultiIndex`` could have a regular ``Index`` instead of a ``MultiIndex`` (:issue:`13144`)
- Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`)


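For context, a minimal sketch of the behaviour the two ``SparseSeries`` entries above describe, assuming the pandas 0.18.x sparse API (``Series.to_sparse``): before this patch, ``[]`` indexing a ``SparseSeries`` backed by a ``MultiIndex`` could raise ``IndexError``, or return a result whose index had been flattened to a plain ``Index``.

```python
import numpy as np
import pandas as pd

# MultiIndex-backed SparseSeries (0.18.x API assumed)
idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0),
                                 ('C', 0), ('C', 1)])
sparse = pd.Series([1, np.nan, np.nan, 3, np.nan], index=idx).to_sparse()

# With the fix, label indexing keeps the sparse type and the index structure
# instead of raising IndexError or flattening to a plain Index.
sparse['A']        # SparseSeries over the second level, [0, 1]
sparse['C', 0]     # scalar 3.0
sparse[[1, 3, 4]]  # SparseSeries that keeps its MultiIndex
```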
4 changes: 2 additions & 2 deletions pandas/indexes/multi.py
@@ -592,7 +592,6 @@ def fillna(self, value=None, downcast=None):
def get_value(self, series, key):
# somewhat broken encapsulation
from pandas.core.indexing import maybe_droplevels
from pandas.core.series import Series

# Label-based
s = _values_from_object(series)
@@ -604,7 +603,8 @@ def _try_mi(k):
new_values = series._values[loc]
new_index = self[loc]
new_index = maybe_droplevels(new_index, k)
return Series(new_values, index=new_index, name=series.name)
return series._constructor(new_values, index=new_index,
name=series.name).__finalize__(self)

try:
return self._engine.get_value(s, k)
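The substantive change in ``get_value`` is swapping the hard-coded ``Series`` constructor for ``series._constructor`` plus ``__finalize__``, so a ``SparseSeries`` flowing through ``MultiIndex.get_value`` comes back as a ``SparseSeries`` with its metadata intact instead of being densified. A rough sketch of that pattern with a hypothetical helper name (not the actual implementation):

```python
import numpy as np
import pandas as pd


def _rebuild_like(series, values, index):
    # series._constructor is the caller's own class (Series, SparseSeries, ...),
    # so subclasses survive the round trip; __finalize__ copies metadata such
    # as `name` from an existing object.
    return series._constructor(values, index=index,
                               name=series.name).__finalize__(series)


s = pd.Series([1.0, np.nan, 3.0], name='x')
sub = _rebuild_like(s, s.values[:2], s.index[:2])   # still a Series named 'x'
```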
20 changes: 10 additions & 10 deletions pandas/sparse/series.py
@@ -5,14 +5,13 @@

# pylint: disable=E1101,E1103,W0231

from numpy import nan, ndarray
import numpy as np
import warnings
import operator

from pandas.compat.numpy import function as nv
from pandas.core.common import isnull, _values_from_object, _maybe_match_name
from pandas.core.index import Index, _ensure_index
from pandas.core.index import Index, _ensure_index, InvalidIndexError
from pandas.core.series import Series
from pandas.core.frame import DataFrame
from pandas.core.internals import SingleBlockManager
@@ -135,7 +134,7 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block',
if is_sparse_array:
fill_value = data.fill_value
else:
fill_value = nan
fill_value = np.nan

if is_sparse_array:
if isinstance(data, SparseSeries) and index is None:
@@ -393,8 +392,10 @@ def _get_val_at(self, loc):

def __getitem__(self, key):
try:
return self._get_val_at(self.index.get_loc(key))
return self.index.get_value(self, key)

except InvalidIndexError:
pass
except KeyError:
if isinstance(key, (int, np.integer)):
return self._get_val_at(key)
@@ -406,13 +407,12 @@ def __getitem__(self, key):
# Could not hash item, must be array-like?
pass

# is there a case where this would NOT be an ndarray?
# need to find an example, I took out the case for now

key = _values_from_object(key)
dataSlice = self.values[key]
new_index = Index(self.index.view(ndarray)[key])
return self._constructor(dataSlice, index=new_index).__finalize__(self)
if self.index.nlevels > 1 and isinstance(key, tuple):
# to handle MultiIndex labels
key = self.index.get_loc(key)
return self._constructor(self.values[key],
index=self.index[key]).__finalize__(self)

def _get_values(self, indexer):
try:
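In outline, the reworked ``__getitem__`` now tries label-based resolution through ``Index.get_value`` first, and only then falls back to positional or array-like handling, translating a ``MultiIndex`` tuple into locations via ``get_loc``. A simplified sketch of that dispatch (error handling heavily abridged; this is not the real method):

```python
def getitem_sketch(sparse, key):
    # Simplified control flow of SparseSeries.__getitem__ after this change.
    try:
        # Label-based path: MultiIndex.get_value handles tuples and partial
        # labels, and (with the multi.py change above) returns a sparse result.
        return sparse.index.get_value(sparse, key)
    except (KeyError, TypeError):
        pass
    # Positional / array-like path; a tuple key on a MultiIndex is translated
    # into locations first so that slicing the sparse values works.
    if sparse.index.nlevels > 1 and isinstance(key, tuple):
        key = sparse.index.get_loc(key)
    return sparse._constructor(sparse.values[key],
                               index=sparse.index[key]).__finalize__(sparse)
```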
60 changes: 60 additions & 0 deletions pandas/sparse/tests/test_format.py
@@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-
from __future__ import print_function

import numpy as np
import pandas as pd

import pandas.util.testing as tm
from pandas.compat import (is_platform_windows,
is_platform_32bit)
from pandas.core.config import option_context


use_32bit_repr = is_platform_windows() or is_platform_32bit()


class TestSeriesFormatting(tm.TestCase):

_multiprocess_can_split_ = True

def test_sparse_max_row(self):
s = pd.Series([1, np.nan, np.nan, 3, np.nan]).to_sparse()
result = repr(s)
dtype = '' if use_32bit_repr else ', dtype=int32'
exp = ("0 1.0\n1 NaN\n2 NaN\n3 3.0\n"
"4 NaN\ndtype: float64\nBlockIndex\n"
"Block locations: array([0, 3]{0})\n"
"Block lengths: array([1, 1]{0})".format(dtype))
self.assertEqual(result, exp)

with option_context("display.max_rows", 3):
# GH 10560
result = repr(s)
exp = ("0 1.0\n ... \n4 NaN\n"
"dtype: float64\nBlockIndex\n"
"Block locations: array([0, 3]{0})\n"
"Block lengths: array([1, 1]{0})".format(dtype))
self.assertEqual(result, exp)

def test_sparse_mi_max_row(self):
idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0),
('C', 0), ('C', 1), ('C', 2)])
s = pd.Series([1, np.nan, np.nan, 3, np.nan, np.nan],
index=idx).to_sparse()
result = repr(s)
dtype = '' if use_32bit_repr else ', dtype=int32'
exp = ("A 0 1.0\n 1 NaN\nB 0 NaN\n"
"C 0 3.0\n 1 NaN\n 2 NaN\n"
"dtype: float64\nBlockIndex\n"
"Block locations: array([0, 3], dtype=int32)\n"
"Block lengths: array([1, 1]{0})".format(dtype))
self.assertEqual(result, exp)

with option_context("display.max_rows", 3):
# GH 13144
result = repr(s)
exp = ("A 0 1.0\n ... \nC 2 NaN\n"
"dtype: float64\nBlockIndex\n"
"Block locations: array([0, 3], dtype=int32)\n"
"Block lengths: array([1, 1]{0})".format(dtype))
self.assertEqual(result, exp)
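For reference, a minimal interactive sketch of what these assertions pin down, assuming the pandas 0.18.x sparse API: with a small ``display.max_rows`` the row display is truncated, but the ``BlockIndex`` summary lines are still appended to the repr.

```python
import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0),
                                 ('C', 0), ('C', 1), ('C', 2)])
s = pd.Series([1, np.nan, np.nan, 3, np.nan, np.nan], index=idx).to_sparse()

with pd.option_context("display.max_rows", 3):
    print(repr(s))  # truncated rows, followed by the BlockIndex / block info
```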
142 changes: 130 additions & 12 deletions pandas/sparse/tests/test_indexing.py
@@ -10,9 +10,13 @@ class TestSparseSeriesIndexing(tm.TestCase):

_multiprocess_can_split_ = True

def setUp(self):
self.orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
self.sparse = self.orig.to_sparse()

def test_getitem(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()
orig = self.orig
sparse = self.sparse

self.assertEqual(sparse[0], 1)
self.assertTrue(np.isnan(sparse[1]))
@@ -33,8 +37,9 @@ def test_getitem(self):
tm.assert_sp_series_equal(result, exp)

def test_getitem_slice(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()
orig = self.orig
sparse = self.sparse

tm.assert_sp_series_equal(sparse[:2], orig[:2].to_sparse())
tm.assert_sp_series_equal(sparse[4:2], orig[4:2].to_sparse())
tm.assert_sp_series_equal(sparse[::2], orig[::2].to_sparse())
@@ -84,8 +89,8 @@ def test_getitem_slice_fill_value(self):
orig[-5:].to_sparse(fill_value=0))

def test_loc(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()
orig = self.orig
sparse = self.sparse

self.assertEqual(sparse.loc[0], 1)
self.assertTrue(np.isnan(sparse.loc[1]))
@@ -154,19 +159,26 @@ def test_loc_index_fill_value(self):
tm.assert_sp_series_equal(result, exp)

def test_loc_slice(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()
orig = self.orig
sparse = self.sparse
tm.assert_sp_series_equal(sparse.loc[2:], orig.loc[2:].to_sparse())

def test_loc_slice_index_fill_value(self):
orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE'))
sparse = orig.to_sparse(fill_value=0)

tm.assert_sp_series_equal(sparse.loc['C':],
orig.loc['C':].to_sparse(fill_value=0))

def test_loc_slice_fill_value(self):
orig = pd.Series([1, np.nan, 0, 3, 0])
sparse = orig.to_sparse(fill_value=0)
tm.assert_sp_series_equal(sparse.loc[2:],
orig.loc[2:].to_sparse(fill_value=0))

def test_iloc(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()
orig = self.orig
sparse = self.sparse

self.assertEqual(sparse.iloc[3], 3)
self.assertTrue(np.isnan(sparse.iloc[2]))
@@ -234,8 +246,9 @@ def test_at_fill_value(self):
self.assertEqual(sparse.at['e'], orig.at['e'])

def test_iat(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()
orig = self.orig
sparse = self.sparse

self.assertEqual(sparse.iat[0], orig.iat[0])
self.assertTrue(np.isnan(sparse.iat[1]))
self.assertTrue(np.isnan(sparse.iat[2]))
@@ -356,6 +369,111 @@ def test_reindex_fill_value(self):
tm.assert_sp_series_equal(res, exp)


class TestSparseSeriesMultiIndexing(TestSparseSeriesIndexing):

_multiprocess_can_split_ = True

def setUp(self):
# MultiIndex with duplicated values
idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0),
('C', 0), ('C', 1)])
self.orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=idx)
self.sparse = self.orig.to_sparse()

def test_getitem_multi(self):
orig = self.orig
sparse = self.sparse

self.assertEqual(sparse[0], orig[0])
self.assertTrue(np.isnan(sparse[1]))
self.assertEqual(sparse[3], orig[3])

tm.assert_sp_series_equal(sparse['A'], orig['A'].to_sparse())
tm.assert_sp_series_equal(sparse['B'], orig['B'].to_sparse())

result = sparse[[1, 3, 4]]
exp = orig[[1, 3, 4]].to_sparse()
tm.assert_sp_series_equal(result, exp)

# dense array
result = sparse[orig % 2 == 1]
exp = orig[orig % 2 == 1].to_sparse()
tm.assert_sp_series_equal(result, exp)

# sparse array (actually it coerces to a normal Series)
result = sparse[sparse % 2 == 1]
exp = orig[orig % 2 == 1].to_sparse()
tm.assert_sp_series_equal(result, exp)

def test_getitem_multi_tuple(self):
orig = self.orig
sparse = self.sparse

self.assertEqual(sparse['C', 0], orig['C', 0])
self.assertTrue(np.isnan(sparse['A', 1]))
self.assertTrue(np.isnan(sparse['B', 0]))

def test_getitems_slice_multi(self):
orig = self.orig
sparse = self.sparse

tm.assert_sp_series_equal(sparse[2:], orig[2:].to_sparse())
tm.assert_sp_series_equal(sparse.loc['B':], orig.loc['B':].to_sparse())
tm.assert_sp_series_equal(sparse.loc['C':], orig.loc['C':].to_sparse())

tm.assert_sp_series_equal(sparse.loc['A':'B'],
orig.loc['A':'B'].to_sparse())
tm.assert_sp_series_equal(sparse.loc[:'B'], orig.loc[:'B'].to_sparse())

def test_loc(self):
# needs to be overridden to use different labels
orig = self.orig
sparse = self.sparse

tm.assert_sp_series_equal(sparse.loc['A'],
orig.loc['A'].to_sparse())
tm.assert_sp_series_equal(sparse.loc['B'],
orig.loc['B'].to_sparse())

result = sparse.loc[[1, 3, 4]]
exp = orig.loc[[1, 3, 4]].to_sparse()
tm.assert_sp_series_equal(result, exp)

# exceeds the bounds
result = sparse.loc[[1, 3, 4, 5]]
exp = orig.loc[[1, 3, 4, 5]].to_sparse()
tm.assert_sp_series_equal(result, exp)

# dense array
result = sparse.loc[orig % 2 == 1]
exp = orig.loc[orig % 2 == 1].to_sparse()
tm.assert_sp_series_equal(result, exp)

# sparse array (actually it coerces to a normal Series)
result = sparse.loc[sparse % 2 == 1]
exp = orig.loc[orig % 2 == 1].to_sparse()
tm.assert_sp_series_equal(result, exp)

def test_loc_multi_tuple(self):
orig = self.orig
sparse = self.sparse

self.assertEqual(sparse.loc['C', 0], orig.loc['C', 0])
self.assertTrue(np.isnan(sparse.loc['A', 1]))
self.assertTrue(np.isnan(sparse.loc['B', 0]))

def test_loc_slice(self):
orig = self.orig
sparse = self.sparse
tm.assert_sp_series_equal(sparse.loc['A':], orig.loc['A':].to_sparse())
tm.assert_sp_series_equal(sparse.loc['B':], orig.loc['B':].to_sparse())
tm.assert_sp_series_equal(sparse.loc['C':], orig.loc['C':].to_sparse())

tm.assert_sp_series_equal(sparse.loc['A':'B'],
orig.loc['A':'B'].to_sparse())
tm.assert_sp_series_equal(sparse.loc[:'B'], orig.loc[:'B'].to_sparse())


class TestSparseDataFrameIndexing(tm.TestCase):

_multiprocess_can_split_ = True
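Note the structure of these tests: ``TestSparseSeriesMultiIndexing`` subclasses ``TestSparseSeriesIndexing`` and overrides only ``setUp`` (plus the label-specific cases), so every inherited test is re-run against the ``MultiIndex``-backed fixtures. A generic sketch of that idiom, with hypothetical class names:

```python
import unittest


class BaseFixtureTests(unittest.TestCase):
    def setUp(self):
        self.data = [1, 2, 3]

    def test_len(self):
        # Runs once per subclass, each time against that subclass's fixture.
        self.assertEqual(len(self.data), 3)


class TupleFixtureTests(BaseFixtureTests):
    def setUp(self):
        # Only the fixture changes; all inherited tests are re-executed.
        self.data = (1, 2, 3)
```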
9 changes: 9 additions & 0 deletions pandas/sparse/tests/test_series.py
@@ -1019,6 +1019,15 @@ def test_from_coo_nodense_index(self):
check = check.dropna().to_sparse()
tm.assert_sp_series_equal(ss, check)

def test_from_coo_long_repr(self):
# GH 13114
# test that it doesn't raise an error; formatting is tested in test_format
tm._skip_if_no_scipy()
import scipy.sparse

sparse = SparseSeries.from_coo(scipy.sparse.rand(350, 18))
repr(sparse)

def _run_test(self, ss, kwargs, check):
results = ss.to_coo(**kwargs)
self._check_results_to_coo(results, check)
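A minimal sketch of what ``test_from_coo_long_repr`` exercises, assuming SciPy is installed and the pandas 0.18.x ``SparseSeries.from_coo`` API: a long series built from a random COO matrix whose repr must render (truncated) without raising.

```python
import scipy.sparse
import pandas as pd

# 350x18 random COO matrix -> long SparseSeries with a (row, col) MultiIndex
coo = scipy.sparse.rand(350, 18)
ss = pd.SparseSeries.from_coo(coo)
repr(ss)  # should not raise; rows are truncated under display.max_rows
```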
19 changes: 0 additions & 19 deletions pandas/tests/formats/test_format.py
@@ -3758,25 +3758,6 @@ def test_to_string_header(self):
exp = '0 0\n ..\n9 9'
self.assertEqual(res, exp)

def test_sparse_max_row(self):
s = pd.Series([1, np.nan, np.nan, 3, np.nan]).to_sparse()
result = repr(s)
dtype = '' if use_32bit_repr else ', dtype=int32'
exp = ("0 1.0\n1 NaN\n2 NaN\n3 3.0\n"
"4 NaN\ndtype: float64\nBlockIndex\n"
"Block locations: array([0, 3]{0})\n"
"Block lengths: array([1, 1]{0})".format(dtype))
self.assertEqual(result, exp)

with option_context("display.max_rows", 3):
# GH 10560
result = repr(s)
exp = ("0 1.0\n ... \n4 NaN\n"
"dtype: float64\nBlockIndex\n"
"Block locations: array([0, 3]{0})\n"
"Block lengths: array([1, 1]{0})".format(dtype))
self.assertEqual(result, exp)


class TestEngFormatter(tm.TestCase):
_multiprocess_can_split_ = True