BUG: Sparse misc fixes including __repr__ #12779

Closed
wants to merge 1 commit into from
13 changes: 13 additions & 0 deletions doc/source/whatsnew/v0.18.1.txt
@@ -81,6 +81,13 @@ API changes
- ``CParserError`` is now a ``ValueError`` instead of just an ``Exception`` (:issue:`12551`)

- ``pd.show_versions()`` now includes ``pandas_datareader`` version (:issue:`12740`)
- ``SparseArray.take`` now returns scalar for scalar input, ``SparseArray`` for others (:issue:`10560`)

.. ipython:: python

s = pd.SparseArray([np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6])
s.take(0)
s.take([1, 2, 3])

.. _whatsnew_0181.apply_resample:

@@ -211,3 +218,9 @@ Bug Fixes
- Bug in ``.describe()`` resets categorical columns information (:issue:`11558`)
- Bug where ``loffset`` argument was not applied when calling ``resample().count()`` on a timeseries (:issue:`12725`)
- ``pd.read_excel()`` now accepts path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path, in line with other ``read_*`` functions (:issue:`12655`)


- Bug in ``SparseSeries.loc[]`` with list-like input raises ``TypeError`` (:issue:`10560`)
- Bug in ``SparseSeries.iloc[]`` with scalar input may raise ``IndexError`` (:issue:`10560`)
- Bug in ``SparseSeries.loc[]``, ``.iloc[]`` with ``slice`` returns ``SparseArray``, rather than ``SparseSeries`` (:issue:`10560`)
- Bug in ``SparseSeries.__repr__`` raises ``TypeError`` when it is longer than ``max_rows`` (:issue:`10560`)
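
The ``SparseSeries`` entries above are exercised by the new ``pandas/sparse/tests/test_indexing.py`` later in this diff; a minimal sketch of the repaired behaviour, assuming a pandas build with this patch applied:

    import numpy as np
    import pandas as pd
    import pandas.util.testing as tm

    orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
    sparse = orig.to_sparse()

    # scalar positional access no longer raises IndexError
    assert sparse.iloc[3] == 3
    assert np.isnan(sparse.iloc[2])

    # list-like .loc no longer raises TypeError and matches the dense result
    tm.assert_sp_series_equal(sparse.loc[[1, 3, 4]],
                              orig.loc[[1, 3, 4]].to_sparse())

    # slicing now returns a SparseSeries rather than a SparseArray
    assert isinstance(sparse.loc[2:], pd.SparseSeries)
    assert isinstance(sparse.iloc[2:], pd.SparseSeries)
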
3 changes: 1 addition & 2 deletions pandas/io/tests/test_pickle.py
@@ -10,7 +10,6 @@
import pandas as pd
from pandas import Index
from pandas.compat import u
from pandas.sparse.tests import test_sparse
from pandas.util.misc import is_little_endian
import pandas
import pandas.util.testing as tm
@@ -46,7 +45,7 @@ def compare_element(self, result, expected, typ, version=None):
return

if typ.startswith('sp_'):
comparator = getattr(test_sparse, "assert_%s_equal" % typ)
comparator = getattr(tm, "assert_%s_equal" % typ)
comparator(result, expected, exact_indices=False)
else:
comparator = getattr(tm, "assert_%s_equal" %
12 changes: 7 additions & 5 deletions pandas/sparse/array.py
@@ -280,10 +280,7 @@ def __getitem__(self, key):
if isinstance(key, SparseArray):
key = np.asarray(key)
if hasattr(key, '__len__') and len(self) != len(key):
indices = self.sp_index
if hasattr(indices, 'to_int_index'):
indices = indices.to_int_index()
data_slice = self.values.take(indices.indices)[key]
return self.take(key)
else:
data_slice = self.values[key]
return self._constructor(data_slice)
@@ -320,6 +317,11 @@ def take(self, indices, axis=0):
"""
if axis:
raise ValueError("axis must be 0, input was {0}".format(axis))

if com.is_integer(indices):
# return scalar
return self[indices]

indices = np.atleast_1d(np.asarray(indices, dtype=int))

# allow -1 to indicate missing values
@@ -344,7 +346,7 @@
result = np.empty(len(indices))
result.fill(self.fill_value)

return result
return self._constructor(result)

def __setitem__(self, key, value):
# if com.is_integer(key):
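
Put roughly, the ``take`` change above special-cases integer input by delegating to ``__getitem__`` (so a scalar comes back) and wraps array results in ``self._constructor`` (so a ``SparseArray`` comes back). A small sketch of the resulting behaviour, mirroring the whatsnew example and assuming a build with this patch:

    import numpy as np
    import pandas as pd

    arr = pd.SparseArray([np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6])

    # scalar index -> scalar value (previously a length-1 ndarray)
    assert np.isnan(arr.take(0))
    assert arr.take(2) == 1

    # list-like index -> SparseArray (previously a dense ndarray)
    assert isinstance(arr.take([2, 3]), pd.SparseArray)
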
3 changes: 2 additions & 1 deletion pandas/sparse/frame.py
@@ -543,9 +543,10 @@ def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
continue

values = series.values
# .take returns SparseArray
new = values.take(indexer)

if need_mask:
new = new.values
np.putmask(new, mask, fill_value)

new_series[col] = new
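
The ``new.values`` line above is needed because ``take`` now hands back a ``SparseArray``, while ``np.putmask`` wants a plain writable ndarray, so the dense values are pulled out only when missing labels actually have to be filled. A minimal sketch of that situation (variable names mirror the hunk above, the data is made up):

    import numpy as np
    import pandas as pd

    values = pd.SparseArray([1.0, np.nan, 3.0])
    indexer = np.array([0, 2, -1])   # -1 marks a label missing after reindexing
    mask = indexer == -1

    new = values.take(indexer)       # now a SparseArray, no longer a dense ndarray
    if mask.any():
        new = new.values             # densify so np.putmask can write in place
        np.putmask(new, mask, np.nan)
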
33 changes: 32 additions & 1 deletion pandas/sparse/series.py
@@ -354,10 +354,33 @@ def _set_subtyp(self, is_all_dates):
else:
object.__setattr__(self, '_subtyp', 'sparse_series')

def _ixs(self, i, axis=0):
"""
Return the i-th value or values in the SparseSeries by location

Parameters
----------
i : int, slice, or sequence of integers

Returns
-------
value : scalar (int) or Series (slice, sequence)
"""
label = self.index[i]
if isinstance(label, Index):
return self.take(i, axis=axis, convert=True)
else:
return self._get_val_at(i)

def _get_val_at(self, loc):
""" forward to the array """
return self.block.values._get_val_at(loc)

def _slice(self, slobj, axis=0, kind=None):
slobj = self.index._convert_slice_indexer(slobj,
kind=kind or 'getitem')
return self._get_values(slobj)

def __getitem__(self, key):
"""

@@ -382,6 +405,13 @@ def __getitem__(self, key):
new_index = Index(self.index.view(ndarray)[key])
return self._constructor(dataSlice, index=new_index).__finalize__(self)

def _get_values(self, indexer):
try:
return self._constructor(self._data.get_slice(indexer),
fastpath=True).__finalize__(self)
except Exception:
return self[indexer]

def _set_with_engine(self, key, value):
return self.set_value(key, value)

@@ -517,7 +547,8 @@ def copy(self, deep=True):
return self._constructor(new_data, sparse_index=self.sp_index,
fill_value=self.fill_value).__finalize__(self)

def reindex(self, index=None, method=None, copy=True, limit=None):
def reindex(self, index=None, method=None, copy=True, limit=None,
**kwargs):
"""
Conform SparseSeries to new Index

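
Taken together, the new ``_ixs``/``_slice``/``_get_values`` hooks route positional indexing: a single integer resolves ``self.index[i]`` to one label and returns a scalar via ``_get_val_at``, while a sequence or slice resolves to an ``Index`` and goes through ``take``, so the result stays a ``SparseSeries``. A usage sketch under that reading, assuming this patch is applied:

    import numpy as np
    import pandas as pd

    s = pd.Series([1, np.nan, 3, np.nan]).to_sparse()

    assert s.iloc[0] == 1                                # single position -> scalar
    assert isinstance(s.iloc[[0, 2]], pd.SparseSeries)   # sequence -> SparseSeries via take
    assert isinstance(s.iloc[1:], pd.SparseSeries)       # slice -> _slice/_get_values
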
44 changes: 28 additions & 16 deletions pandas/sparse/tests/test_array.py
@@ -11,15 +11,6 @@
import pandas.util.testing as tm


def assert_sp_array_equal(left, right):
assert_almost_equal(left.sp_values, right.sp_values)
assert (left.sp_index.equals(right.sp_index))
if np.isnan(left.fill_value):
assert (np.isnan(right.fill_value))
else:
assert (left.fill_value == right.fill_value)


class TestSparseArray(tm.TestCase):
_multiprocess_can_split_ = True

@@ -29,11 +20,32 @@ def setUp(self):
self.zarr = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0)

def test_get_item(self):

self.assertTrue(np.isnan(self.arr[1]))
self.assertEqual(self.arr[2], 1)
self.assertEqual(self.arr[7], 5)

self.assertEqual(self.zarr[0], 0)
self.assertEqual(self.zarr[2], 1)
self.assertEqual(self.zarr[7], 5)

errmsg = re.compile("bounds")
assertRaisesRegexp(IndexError, errmsg, lambda: self.arr[11])
assertRaisesRegexp(IndexError, errmsg, lambda: self.arr[-11])
self.assertEqual(self.arr[-1], self.arr[len(self.arr) - 1])

def test_take(self):
self.assertTrue(np.isnan(self.arr.take(0)))
self.assertTrue(np.isscalar(self.arr.take(2)))
self.assertEqual(self.arr.take(2), np.take(self.arr_data, 2))
self.assertEqual(self.arr.take(6), np.take(self.arr_data, 6))

tm.assert_sp_array_equal(self.arr.take([2, 3]),
SparseArray(np.take(self.arr_data, [2, 3])))
tm.assert_sp_array_equal(self.arr.take([0, 1, 2]),
SparseArray(np.take(self.arr_data,
[0, 1, 2])))

def test_bad_take(self):
assertRaisesRegexp(IndexError, "bounds", lambda: self.arr.take(11))
self.assertRaises(IndexError, lambda: self.arr.take(-11))
@@ -96,20 +108,20 @@ def _checkit(i):
def test_getslice(self):
result = self.arr[:-3]
exp = SparseArray(self.arr.values[:-3])
assert_sp_array_equal(result, exp)
tm.assert_sp_array_equal(result, exp)

result = self.arr[-4:]
exp = SparseArray(self.arr.values[-4:])
assert_sp_array_equal(result, exp)
tm.assert_sp_array_equal(result, exp)

# two corner cases from Series
result = self.arr[-12:]
exp = SparseArray(self.arr)
assert_sp_array_equal(result, exp)
tm.assert_sp_array_equal(result, exp)

result = self.arr[:-12]
exp = SparseArray(self.arr.values[:0])
assert_sp_array_equal(result, exp)
tm.assert_sp_array_equal(result, exp)

def test_binary_operators(self):
data1 = np.random.randn(20)
@@ -134,11 +146,11 @@ def _check_op(op, first, second):

res2 = op(first, second.values)
tm.assertIsInstance(res2, SparseArray)
assert_sp_array_equal(res, res2)
tm.assert_sp_array_equal(res, res2)

res3 = op(first.values, second)
tm.assertIsInstance(res3, SparseArray)
assert_sp_array_equal(res, res3)
tm.assert_sp_array_equal(res, res3)

res4 = op(first, 4)
tm.assertIsInstance(res4, SparseArray)
@@ -169,7 +181,7 @@ def test_pickle(self):
def test_pickle(self):
def _check_roundtrip(obj):
unpickled = self.round_trip_pickle(obj)
assert_sp_array_equal(unpickled, obj)
tm.assert_sp_array_equal(unpickled, obj)

_check_roundtrip(self.arr)
_check_roundtrip(self.zarr)
84 changes: 84 additions & 0 deletions pandas/sparse/tests/test_indexing.py
@@ -0,0 +1,84 @@
# pylint: disable-msg=E1101,W0612

import nose # noqa
import numpy as np
import pandas as pd
import pandas.util.testing as tm


class TestSparseSeriesIndexing(tm.TestCase):

_multiprocess_can_split_ = True

def test_loc(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()

self.assertEqual(sparse.loc[0], 1)
self.assertTrue(np.isnan(sparse.loc[1]))

result = sparse.loc[[1, 3, 4]]
exp = orig.loc[[1, 3, 4]].to_sparse()
tm.assert_sp_series_equal(result, exp)

# exceeds the bounds
result = sparse.loc[[1, 3, 4, 5]]
exp = orig.loc[[1, 3, 4, 5]].to_sparse()
tm.assert_sp_series_equal(result, exp)
# padded with NaN
self.assertTrue(np.isnan(result[-1]))

# dense array
result = sparse.loc[orig % 2 == 1]
exp = orig.loc[orig % 2 == 1].to_sparse()
tm.assert_sp_series_equal(result, exp)

# sparse array (actually it coerces to normal Series)
result = sparse.loc[sparse % 2 == 1]
exp = orig.loc[orig % 2 == 1].to_sparse()
tm.assert_sp_series_equal(result, exp)

def test_loc_index(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=list('ABCDE'))
sparse = orig.to_sparse()

self.assertEqual(sparse.loc['A'], 1)
self.assertTrue(np.isnan(sparse.loc['B']))

result = sparse.loc[['A', 'C', 'D']]
exp = orig.loc[['A', 'C', 'D']].to_sparse()
tm.assert_sp_series_equal(result, exp)

# dense array
result = sparse.loc[orig % 2 == 1]
exp = orig.loc[orig % 2 == 1].to_sparse()
tm.assert_sp_series_equal(result, exp)

# sparse array (actually it coerces to normal Series)
result = sparse.loc[sparse % 2 == 1]
exp = orig.loc[orig % 2 == 1].to_sparse()
tm.assert_sp_series_equal(result, exp)

def test_loc_slice(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()
tm.assert_sp_series_equal(sparse.loc[2:], orig.loc[2:].to_sparse())

def test_iloc(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()

self.assertEqual(sparse.iloc[3], 3)
self.assertTrue(np.isnan(sparse.iloc[2]))

result = sparse.iloc[[1, 3, 4]]
exp = orig.iloc[[1, 3, 4]].to_sparse()
tm.assert_sp_series_equal(result, exp)

with tm.assertRaises(IndexError):
sparse.iloc[[1, 3, 5]]

def test_iloc_slice(self):
orig = pd.Series([1, np.nan, np.nan, 3, np.nan])
sparse = orig.to_sparse()
tm.assert_sp_series_equal(sparse.iloc[2:], orig.iloc[2:].to_sparse())