From c90cddeaa0e9b738305ea92ab9f1643a9993fba6 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Thu, 7 Apr 2016 10:33:38 -0400 Subject: [PATCH] ENH/PERF SparseArray.take indexing related to #4400 Added more tests for sparse indexing. ``SparseArray.take`` has optimized logic to omit dense ``np.ndarray`` creation. ``SparseSeries.iloc`` can work with negative indices. Made ``SparseArray.take`` handle negative indices by the same rule as ``Index`` (#12676) Author: sinhrks Closes #12796 from sinhrks/sparse_test_at and squashes the following commits: df1f056 [sinhrks] ENH/PERF SparseArray.take indexing --- doc/source/whatsnew/v0.18.1.txt | 2 +- pandas/core/series.py | 3 - pandas/indexes/base.py | 4 +- pandas/sparse/array.py | 104 +++++++---- pandas/sparse/series.py | 7 +- pandas/sparse/tests/test_array.py | 254 ++++++++++++++++++++++++++- pandas/sparse/tests/test_indexing.py | 172 ++++++++++++++++++ pandas/src/sparse.pyx | 85 ++++++++- pandas/util/testing.py | 5 +- 9 files changed, 567 insertions(+), 69 deletions(-) diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index f991be3dc3e10..3e45b2ca37229 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -81,7 +81,7 @@ These changes conform sparse handling to return the correct types and work to ma - Bug in ``SparseSeries.__repr__`` raises ``TypeError`` when it is longer than ``max_rows`` (:issue:`10560`) - Bug in ``SparseSeries.shape`` ignores ``fill_value`` (:issue:`10452`) - Bug in ``SparseArray.to_dense()`` does not preserve ``dtype`` (:issue:`10648`) -- ``SparseArray.take`` now returns scalar for scalar input, ``SparseArray`` for others (:issue:`10560`) +- ``SparseArray.take`` now returns scalar for scalar input, ``SparseArray`` for others. Also now it handles negative indexer as the same rule as ``Index`` (:issue:`10560`, :issue:`12796`) .. 
ipython:: python diff --git a/pandas/core/series.py b/pandas/core/series.py index ac8f073d0f0a1..bf20c5d740133 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -809,9 +809,6 @@ def _set_values(self, key, value): self._data = self._data.setitem(indexer=key, value=value) self._maybe_update_cacher() - # help out SparseSeries - _get_val_at = ndarray.__getitem__ - def repeat(self, reps): """ return a new Series with the values repeated reps times diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 94f85d40c73cc..0e8fe97c2e497 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -1332,7 +1332,7 @@ def _ensure_compat_concat(indexes): return indexes _index_shared_docs['take'] = """ - return a new Index of the values selected by the indices + return a new %(klass)s of the values selected by the indices For internal compatibility with numpy arrays. @@ -1352,7 +1352,7 @@ def _ensure_compat_concat(indexes): numpy.ndarray.take """ - @Appender(_index_shared_docs['take']) + @Appender(_index_shared_docs['take'] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None): indices = com._ensure_platform_int(indices) if self._can_hold_na: diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index b8a66921fd01d..602098be2901b 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -13,11 +13,16 @@ from pandas import compat, lib from pandas.compat import range -from pandas._sparse import BlockIndex, IntIndex +from pandas._sparse import SparseIndex, BlockIndex, IntIndex import pandas._sparse as splib import pandas.index as _index import pandas.core.ops as ops import pandas.formats.printing as printing +from pandas.util.decorators import Appender +from pandas.indexes.base import _index_shared_docs + + +_sparray_doc_kwargs = dict(klass='SparseArray') def _arith_method(op, name, str_rep=None, default_axis=None, fill_zeros=None, @@ -167,10 +172,19 @@ def __new__(cls, data, sparse_index=None, 
index=None, kind='integer', fill_value = bool(fill_value) # Change the class of the array to be the subclass type. - output = subarr.view(cls) - output.sp_index = sparse_index - output.fill_value = fill_value - return output + return cls._simple_new(subarr, sparse_index, fill_value) + + @classmethod + def _simple_new(cls, data, sp_index, fill_value): + result = data.view(cls) + + if not isinstance(sp_index, SparseIndex): + # caller must pass SparseIndex + raise ValueError('sp_index must be a SparseIndex') + + result.sp_index = sp_index + result.fill_value = fill_value + return result @property def _constructor(self): @@ -308,14 +322,12 @@ def _get_val_at(self, loc): else: return _index.get_value_at(self, sp_loc) - def take(self, indices, axis=0): - """ - Sparse-compatible version of ndarray.take + @Appender(_index_shared_docs['take'] % _sparray_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, + fill_value=None): + + # Sparse-compatible version of ndarray.take, returns SparseArray - Returns - ------- - taken : ndarray - """ if axis: raise ValueError("axis must be 0, input was {0}".format(axis)) @@ -323,31 +335,40 @@ def take(self, indices, axis=0): # return scalar return self[indices] - indices = np.atleast_1d(np.asarray(indices, dtype=int)) - - # allow -1 to indicate missing values + indices = com._ensure_platform_int(indices) n = len(self) - if ((indices >= n) | (indices < -1)).any(): - raise IndexError('out of bounds access') - - if self.sp_index.npoints > 0: - locs = np.array([self.sp_index.lookup(loc) if loc > -1 else -1 - for loc in indices]) - result = self.sp_values.take(locs) - mask = locs == -1 - if mask.any(): - try: - result[mask] = self.fill_value - except ValueError: - # wrong dtype - result = result.astype('float64') - result[mask] = self.fill_value - + if allow_fill and fill_value is not None: + # allow -1 to indicate self.fill_value, + # self.fill_value may not be NaN + if (indices < -1).any(): + msg = ('When allow_fill=True and 
fill_value is not None, ' + 'all indices must be >= -1') + raise ValueError(msg) + elif (n <= indices).any(): + msg = 'index is out of bounds for size {0}' + raise IndexError(msg.format(n)) + else: + if ((indices < -n) | (n <= indices)).any(): + msg = 'index is out of bounds for size {0}' + raise IndexError(msg.format(n)) + + indices = indices.astype(np.int32) + if not (allow_fill and fill_value is not None): + indices = indices.copy() + indices[indices < 0] += n + + locs = self.sp_index.lookup_array(indices) + indexer = np.arange(len(locs), dtype=np.int32) + mask = locs != -1 + if mask.any(): + indexer = indexer[mask] + new_values = self.sp_values.take(locs[mask]) else: - result = np.empty(len(indices)) - result.fill(self.fill_value) + indexer = np.empty(shape=(0, ), dtype=np.int32) + new_values = np.empty(shape=(0, ), dtype=self.sp_values.dtype) - return self._constructor(result) + sp_index = _make_index(len(indices), indexer, kind=self.sp_index) + return self._simple_new(new_values, sp_index, self.fill_value) def __setitem__(self, key, value): # if com.is_integer(key): @@ -525,16 +546,21 @@ def make_sparse(arr, kind='block', fill_value=nan): else: indices = np.arange(length, dtype=np.int32)[mask] - if kind == 'block': + index = _make_index(length, indices, kind) + sparsified_values = arr[mask] + return sparsified_values, index + + +def _make_index(length, indices, kind): + + if kind == 'block' or isinstance(kind, BlockIndex): locs, lens = splib.get_blocks(indices) index = BlockIndex(length, locs, lens) - elif kind == 'integer': + elif kind == 'integer' or isinstance(kind, IntIndex): index = IntIndex(length, indices) else: # pragma: no cover raise ValueError('must be block or integer type') - - sparsified_values = arr[mask] - return sparsified_values, index + return index ops.add_special_arithmetic_methods(SparseArray, arith_method=_arith_method, diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 6dedcdbef3174..fdacf1cffb485 100644 --- 
a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -165,10 +165,10 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block', if index is None: index = data.index.view() else: + data = data.reindex(index, copy=False) else: - length = len(index) if data == fill_value or (isnull(data) and isnull(fill_value)): @@ -376,11 +376,6 @@ def _get_val_at(self, loc): """ forward to the array """ return self.block.values._get_val_at(loc) - def _slice(self, slobj, axis=0, kind=None): - slobj = self.index._convert_slice_indexer(slobj, - kind=kind or 'getitem') - return self._get_values(slobj) - def __getitem__(self, key): """ diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py index 3dd74848107da..076fa71bdd68c 100644 --- a/pandas/sparse/tests/test_array.py +++ b/pandas/sparse/tests/test_array.py @@ -1,18 +1,157 @@ from pandas.compat import range import re -from numpy import nan -import numpy as np - import operator import warnings +from numpy import nan +import numpy as np + from pandas import _np_version_under1p8 from pandas.sparse.api import SparseArray +import pandas.sparse.array as sparray from pandas.util.testing import assert_almost_equal, assertRaisesRegexp import pandas.util.testing as tm +class TestSparseArrayIndex(tm.TestCase): + + _multiprocess_can_split_ = True + + def test_int_internal(self): + idx = sparray._make_index(4, np.array([2, 3], dtype=np.int32), + kind='integer') + self.assertIsInstance(idx, sparray.IntIndex) + self.assertEqual(idx.npoints, 2) + tm.assert_numpy_array_equal(idx.indices, + np.array([2, 3], dtype=np.int32)) + + idx = sparray._make_index(4, np.array([], dtype=np.int32), + kind='integer') + self.assertIsInstance(idx, sparray.IntIndex) + self.assertEqual(idx.npoints, 0) + tm.assert_numpy_array_equal(idx.indices, + np.array([], dtype=np.int32)) + + idx = sparray._make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), + kind='integer') + self.assertIsInstance(idx, sparray.IntIndex) + 
self.assertEqual(idx.npoints, 4) + tm.assert_numpy_array_equal(idx.indices, + np.array([0, 1, 2, 3], dtype=np.int32)) + + def test_block_internal(self): + idx = sparray._make_index(4, np.array([2, 3], dtype=np.int32), + kind='block') + self.assertIsInstance(idx, sparray.BlockIndex) + self.assertEqual(idx.npoints, 2) + tm.assert_numpy_array_equal(idx.blocs, + np.array([2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, + np.array([2], dtype=np.int32)) + + idx = sparray._make_index(4, np.array([], dtype=np.int32), + kind='block') + self.assertIsInstance(idx, sparray.BlockIndex) + self.assertEqual(idx.npoints, 0) + tm.assert_numpy_array_equal(idx.blocs, + np.array([], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, + np.array([], dtype=np.int32)) + + idx = sparray._make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), + kind='block') + self.assertIsInstance(idx, sparray.BlockIndex) + self.assertEqual(idx.npoints, 4) + tm.assert_numpy_array_equal(idx.blocs, + np.array([0], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, + np.array([4], dtype=np.int32)) + + idx = sparray._make_index(4, np.array([0, 2, 3], dtype=np.int32), + kind='block') + self.assertIsInstance(idx, sparray.BlockIndex) + self.assertEqual(idx.npoints, 3) + tm.assert_numpy_array_equal(idx.blocs, + np.array([0, 2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, + np.array([1, 2], dtype=np.int32)) + + def test_lookup(self): + for kind in ['integer', 'block']: + idx = sparray._make_index(4, np.array([2, 3], dtype=np.int32), + kind=kind) + self.assertEqual(idx.lookup(-1), -1) + self.assertEqual(idx.lookup(0), -1) + self.assertEqual(idx.lookup(1), -1) + self.assertEqual(idx.lookup(2), 0) + self.assertEqual(idx.lookup(3), 1) + self.assertEqual(idx.lookup(4), -1) + + idx = sparray._make_index(4, np.array([], dtype=np.int32), + kind=kind) + for i in range(-1, 5): + self.assertEqual(idx.lookup(i), -1) + + idx = sparray._make_index(4, np.array([0, 1, 2, 3], + 
dtype=np.int32), kind=kind) + self.assertEqual(idx.lookup(-1), -1) + self.assertEqual(idx.lookup(0), 0) + self.assertEqual(idx.lookup(1), 1) + self.assertEqual(idx.lookup(2), 2) + self.assertEqual(idx.lookup(3), 3) + self.assertEqual(idx.lookup(4), -1) + + idx = sparray._make_index(4, np.array([0, 2, 3], dtype=np.int32), + kind=kind) + self.assertEqual(idx.lookup(-1), -1) + self.assertEqual(idx.lookup(0), 0) + self.assertEqual(idx.lookup(1), -1) + self.assertEqual(idx.lookup(2), 1) + self.assertEqual(idx.lookup(3), 2) + self.assertEqual(idx.lookup(4), -1) + + def test_lookup_array(self): + for kind in ['integer', 'block']: + idx = sparray._make_index(4, np.array([2, 3], dtype=np.int32), + kind=kind) + + res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32)) + exp = np.array([-1, -1, 0], dtype=np.int32) + self.assert_numpy_array_equal(res, exp) + + res = idx.lookup_array(np.array([4, 2, 1, 3], dtype=np.int32)) + exp = np.array([-1, 0, -1, 1], dtype=np.int32) + self.assert_numpy_array_equal(res, exp) + + idx = sparray._make_index(4, np.array([], dtype=np.int32), + kind=kind) + res = idx.lookup_array(np.array([-1, 0, 2, 4], dtype=np.int32)) + exp = np.array([-1, -1, -1, -1], dtype=np.int32) + + idx = sparray._make_index(4, np.array([0, 1, 2, 3], + dtype=np.int32), + kind=kind) + res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32)) + exp = np.array([-1, 0, 2], dtype=np.int32) + self.assert_numpy_array_equal(res, exp) + + res = idx.lookup_array(np.array([4, 2, 1, 3], dtype=np.int32)) + exp = np.array([-1, 2, 1, 3], dtype=np.int32) + self.assert_numpy_array_equal(res, exp) + + idx = sparray._make_index(4, np.array([0, 2, 3], dtype=np.int32), + kind=kind) + res = idx.lookup_array(np.array([2, 1, 3, 0], dtype=np.int32)) + exp = np.array([1, -1, 2, 0], dtype=np.int32) + self.assert_numpy_array_equal(res, exp) + + res = idx.lookup_array(np.array([1, 4, 2, 5], dtype=np.int32)) + exp = np.array([-1, -1, 1, -1], dtype=np.int32) + self.assert_numpy_array_equal(res, 
exp) + + class TestSparseArray(tm.TestCase): + _multiprocess_can_split_ = True def setUp(self): @@ -44,17 +183,114 @@ def test_take(self): self.assertEqual(self.arr.take(2), np.take(self.arr_data, 2)) self.assertEqual(self.arr.take(6), np.take(self.arr_data, 6)) - tm.assert_sp_array_equal(self.arr.take([2, 3]), - SparseArray(np.take(self.arr_data, - [2, 3]))) - tm.assert_sp_array_equal(self.arr.take([0, 1, 2]), - SparseArray(np.take(self.arr_data, - [0, 1, 2]))) + exp = SparseArray(np.take(self.arr_data, [2, 3])) + tm.assert_sp_array_equal(self.arr.take([2, 3]), exp) + + exp = SparseArray(np.take(self.arr_data, [0, 1, 2])) + tm.assert_sp_array_equal(self.arr.take([0, 1, 2]), exp) + + def test_take_fill_value(self): + data = np.array([1, np.nan, 0, 3, 0]) + sparse = SparseArray(data, fill_value=0) + + exp = SparseArray(np.take(data, [0]), fill_value=0) + tm.assert_sp_array_equal(sparse.take([0]), exp) + + exp = SparseArray(np.take(data, [1, 3, 4]), fill_value=0) + tm.assert_sp_array_equal(sparse.take([1, 3, 4]), exp) + + def test_take_negative(self): + exp = SparseArray(np.take(self.arr_data, [-1])) + tm.assert_sp_array_equal(self.arr.take([-1]), exp) + + exp = SparseArray(np.take(self.arr_data, [-4, -3, -2])) + tm.assert_sp_array_equal(self.arr.take([-4, -3, -2]), exp) def test_bad_take(self): assertRaisesRegexp(IndexError, "bounds", lambda: self.arr.take(11)) self.assertRaises(IndexError, lambda: self.arr.take(-11)) + def test_take_filling(self): + # similar tests as GH 12631 + sparse = SparseArray([np.nan, np.nan, 1, np.nan, 4]) + result = sparse.take(np.array([1, 0, -1])) + expected = SparseArray([np.nan, np.nan, 4]) + tm.assert_sp_array_equal(result, expected) + + # fill_value + result = sparse.take(np.array([1, 0, -1]), fill_value=True) + expected = SparseArray([np.nan, np.nan, np.nan]) + tm.assert_sp_array_equal(result, expected) + + # allow_fill=False + result = sparse.take(np.array([1, 0, -1]), + allow_fill=False, fill_value=True) + expected = 
SparseArray([np.nan, np.nan, 4]) + tm.assert_sp_array_equal(result, expected) + + msg = ('When allow_fill=True and fill_value is not None, ' + 'all indices must be >= -1') + with tm.assertRaisesRegexp(ValueError, msg): + sparse.take(np.array([1, 0, -2]), fill_value=True) + with tm.assertRaisesRegexp(ValueError, msg): + sparse.take(np.array([1, 0, -5]), fill_value=True) + + with tm.assertRaises(IndexError): + sparse.take(np.array([1, -6])) + with tm.assertRaises(IndexError): + sparse.take(np.array([1, 5])) + with tm.assertRaises(IndexError): + sparse.take(np.array([1, 5]), fill_value=True) + + def test_take_filling_fill_value(self): + # same tests as GH 12631 + sparse = SparseArray([np.nan, 0, 1, 0, 4], fill_value=0) + result = sparse.take(np.array([1, 0, -1])) + expected = SparseArray([0, np.nan, 4], fill_value=0) + tm.assert_sp_array_equal(result, expected) + + # fill_value + result = sparse.take(np.array([1, 0, -1]), fill_value=True) + expected = SparseArray([0, np.nan, 0], fill_value=0) + tm.assert_sp_array_equal(result, expected) + + # allow_fill=False + result = sparse.take(np.array([1, 0, -1]), + allow_fill=False, fill_value=True) + expected = SparseArray([0, np.nan, 4], fill_value=0) + tm.assert_sp_array_equal(result, expected) + + msg = ('When allow_fill=True and fill_value is not None, ' + 'all indices must be >= -1') + with tm.assertRaisesRegexp(ValueError, msg): + sparse.take(np.array([1, 0, -2]), fill_value=True) + with tm.assertRaisesRegexp(ValueError, msg): + sparse.take(np.array([1, 0, -5]), fill_value=True) + + with tm.assertRaises(IndexError): + sparse.take(np.array([1, -6])) + with tm.assertRaises(IndexError): + sparse.take(np.array([1, 5])) + with tm.assertRaises(IndexError): + sparse.take(np.array([1, 5]), fill_value=True) + + def test_take_filling_all_nan(self): + sparse = SparseArray([np.nan, np.nan, np.nan, np.nan, np.nan]) + result = sparse.take(np.array([1, 0, -1])) + expected = SparseArray([np.nan, np.nan, np.nan]) + 
tm.assert_sp_array_equal(result, expected) + + result = sparse.take(np.array([1, 0, -1]), fill_value=True) + expected = SparseArray([np.nan, np.nan, np.nan]) + tm.assert_sp_array_equal(result, expected) + + with tm.assertRaises(IndexError): + sparse.take(np.array([1, -6])) + with tm.assertRaises(IndexError): + sparse.take(np.array([1, 5])) + with tm.assertRaises(IndexError): + sparse.take(np.array([1, 5]), fill_value=True) + def test_set_item(self): def setitem(): self.arr[5] = 3 diff --git a/pandas/sparse/tests/test_indexing.py b/pandas/sparse/tests/test_indexing.py index 0e218d2639662..fb89d4486b890 100644 --- a/pandas/sparse/tests/test_indexing.py +++ b/pandas/sparse/tests/test_indexing.py @@ -10,6 +10,51 @@ class TestSparseSeriesIndexing(tm.TestCase): _multiprocess_can_split_ = True + def test_getitem(self): + orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) + sparse = orig.to_sparse() + + self.assertEqual(sparse[0], 1) + self.assertTrue(np.isnan(sparse[1])) + self.assertEqual(sparse[3], 3) + + result = sparse[[1, 3, 4]] + exp = orig[[1, 3, 4]].to_sparse() + tm.assert_sp_series_equal(result, exp) + + # dense array + result = sparse[orig % 2 == 1] + exp = orig[orig % 2 == 1].to_sparse() + tm.assert_sp_series_equal(result, exp) + + # sparse array (actuary it coerces to normal Series) + result = sparse[sparse % 2 == 1] + exp = orig[orig % 2 == 1].to_sparse() + tm.assert_sp_series_equal(result, exp) + + def test_getitem_fill_value(self): + orig = pd.Series([1, np.nan, 0, 3, 0]) + sparse = orig.to_sparse(fill_value=0) + + self.assertEqual(sparse[0], 1) + self.assertTrue(np.isnan(sparse[1])) + self.assertEqual(sparse[2], 0) + self.assertEqual(sparse[3], 3) + + result = sparse[[1, 3, 4]] + exp = orig[[1, 3, 4]].to_sparse(fill_value=0) + tm.assert_sp_series_equal(result, exp) + + # dense array + result = sparse[orig % 2 == 1] + exp = orig[orig % 2 == 1].to_sparse(fill_value=0) + tm.assert_sp_series_equal(result, exp) + + # sparse array (actuary it coerces to normal 
Series) + result = sparse[sparse % 2 == 1] + exp = orig[orig % 2 == 1].to_sparse(fill_value=0) + tm.assert_sp_series_equal(result, exp) + def test_loc(self): orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) sparse = orig.to_sparse() @@ -59,11 +104,38 @@ def test_loc_index(self): exp = orig.loc[orig % 2 == 1].to_sparse() tm.assert_sp_series_equal(result, exp) + def test_loc_index_fill_value(self): + orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE')) + sparse = orig.to_sparse(fill_value=0) + + self.assertEqual(sparse.loc['A'], 1) + self.assertTrue(np.isnan(sparse.loc['B'])) + + result = sparse.loc[['A', 'C', 'D']] + exp = orig.loc[['A', 'C', 'D']].to_sparse(fill_value=0) + tm.assert_sp_series_equal(result, exp) + + # dense array + result = sparse.loc[orig % 2 == 1] + exp = orig.loc[orig % 2 == 1].to_sparse(fill_value=0) + tm.assert_sp_series_equal(result, exp) + + # sparse array (actuary it coerces to normal Series) + result = sparse.loc[sparse % 2 == 1] + exp = orig.loc[orig % 2 == 1].to_sparse(fill_value=0) + tm.assert_sp_series_equal(result, exp) + def test_loc_slice(self): orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) sparse = orig.to_sparse() tm.assert_sp_series_equal(sparse.loc[2:], orig.loc[2:].to_sparse()) + def test_loc_slice_fill_value(self): + orig = pd.Series([1, np.nan, 0, 3, 0]) + sparse = orig.to_sparse(fill_value=0) + tm.assert_sp_series_equal(sparse.loc[2:], + orig.loc[2:].to_sparse(fill_value=0)) + def test_iloc(self): orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) sparse = orig.to_sparse() @@ -75,14 +147,114 @@ def test_iloc(self): exp = orig.iloc[[1, 3, 4]].to_sparse() tm.assert_sp_series_equal(result, exp) + result = sparse.iloc[[1, -2, -4]] + exp = orig.iloc[[1, -2, -4]].to_sparse() + tm.assert_sp_series_equal(result, exp) + with tm.assertRaises(IndexError): sparse.iloc[[1, 3, 5]] + def test_iloc_fill_value(self): + orig = pd.Series([1, np.nan, 0, 3, 0]) + sparse = orig.to_sparse(fill_value=0) + + self.assertEqual(sparse.iloc[3], 
3) + self.assertTrue(np.isnan(sparse.iloc[1])) + self.assertEqual(sparse.iloc[4], 0) + + result = sparse.iloc[[1, 3, 4]] + exp = orig.iloc[[1, 3, 4]].to_sparse(fill_value=0) + tm.assert_sp_series_equal(result, exp) + def test_iloc_slice(self): orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) sparse = orig.to_sparse() tm.assert_sp_series_equal(sparse.iloc[2:], orig.iloc[2:].to_sparse()) + def test_iloc_slice_fill_value(self): + orig = pd.Series([1, np.nan, 0, 3, 0]) + sparse = orig.to_sparse(fill_value=0) + tm.assert_sp_series_equal(sparse.iloc[2:], + orig.iloc[2:].to_sparse(fill_value=0)) + + def test_at(self): + orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) + sparse = orig.to_sparse() + self.assertEqual(sparse.at[0], orig.at[0]) + self.assertTrue(np.isnan(sparse.at[1])) + self.assertTrue(np.isnan(sparse.at[2])) + self.assertEqual(sparse.at[3], orig.at[3]) + self.assertTrue(np.isnan(sparse.at[4])) + + orig = pd.Series([1, np.nan, np.nan, 3, np.nan], + index=list('abcde')) + sparse = orig.to_sparse() + self.assertEqual(sparse.at['a'], orig.at['a']) + self.assertTrue(np.isnan(sparse.at['b'])) + self.assertTrue(np.isnan(sparse.at['c'])) + self.assertEqual(sparse.at['d'], orig.at['d']) + self.assertTrue(np.isnan(sparse.at['e'])) + + def test_at_fill_value(self): + orig = pd.Series([1, np.nan, 0, 3, 0], + index=list('abcde')) + sparse = orig.to_sparse(fill_value=0) + self.assertEqual(sparse.at['a'], orig.at['a']) + self.assertTrue(np.isnan(sparse.at['b'])) + self.assertEqual(sparse.at['c'], orig.at['c']) + self.assertEqual(sparse.at['d'], orig.at['d']) + self.assertEqual(sparse.at['e'], orig.at['e']) + + def test_iat(self): + orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) + sparse = orig.to_sparse() + self.assertEqual(sparse.iat[0], orig.iat[0]) + self.assertTrue(np.isnan(sparse.iat[1])) + self.assertTrue(np.isnan(sparse.iat[2])) + self.assertEqual(sparse.iat[3], orig.iat[3]) + self.assertTrue(np.isnan(sparse.iat[4])) + + self.assertTrue(np.isnan(sparse.iat[-1])) + 
self.assertEqual(sparse.iat[-5], orig.iat[-5]) + + def test_iat_fill_value(self): + orig = pd.Series([1, np.nan, 0, 3, 0]) + sparse = orig.to_sparse() + self.assertEqual(sparse.iat[0], orig.iat[0]) + self.assertTrue(np.isnan(sparse.iat[1])) + self.assertEqual(sparse.iat[2], orig.iat[2]) + self.assertEqual(sparse.iat[3], orig.iat[3]) + self.assertEqual(sparse.iat[4], orig.iat[4]) + + self.assertEqual(sparse.iat[-1], orig.iat[-1]) + self.assertEqual(sparse.iat[-5], orig.iat[-5]) + + def test_take(self): + orig = pd.Series([1, np.nan, np.nan, 3, np.nan], + index=list('ABCDE')) + sparse = orig.to_sparse() + + tm.assert_sp_series_equal(sparse.take([0]), + orig.take([0]).to_sparse()) + tm.assert_sp_series_equal(sparse.take([0, 1, 3]), + orig.take([0, 1, 3]).to_sparse()) + tm.assert_sp_series_equal(sparse.take([-1, -2]), + orig.take([-1, -2]).to_sparse()) + + def test_take_fill_value(self): + orig = pd.Series([1, np.nan, 0, 3, 0], + index=list('ABCDE')) + sparse = orig.to_sparse(fill_value=0) + + tm.assert_sp_series_equal(sparse.take([0]), + orig.take([0]).to_sparse(fill_value=0)) + + exp = orig.take([0, 1, 3]).to_sparse(fill_value=0) + tm.assert_sp_series_equal(sparse.take([0, 1, 3]), exp) + + exp = orig.take([-1, -2]).to_sparse(fill_value=0) + tm.assert_sp_series_equal(sparse.take([-1, -2]), exp) + class TestSparseDataFrameIndexing(tm.TestCase): diff --git a/pandas/src/sparse.pyx b/pandas/src/sparse.pyx index 6744c6e5a4e07..4797f3ce71618 100644 --- a/pandas/src/sparse.pyx +++ b/pandas/src/sparse.pyx @@ -1,4 +1,4 @@ -from numpy cimport ndarray, int32_t, float64_t +from numpy cimport ndarray, uint8_t, int32_t, float64_t cimport numpy as np cimport cython @@ -177,12 +177,21 @@ cdef class IntIndex(SparseIndex): return IntIndex(x.length, new_list) @cython.wraparound(False) - cpdef lookup(self, Py_ssize_t index): + cpdef int lookup(self, Py_ssize_t index): + """ + Return the internal location if value exists on given index. + Return -1 otherwise. 
+ """ cdef: - Py_ssize_t res, n, cum_len = 0 + Py_ssize_t res ndarray[int32_t, ndim=1] inds inds = self.indices + if self.npoints == 0: + return -1 + elif index < 0 or self.length <= index: + return -1 + res = inds.searchsorted(index) if res == self.npoints: return -1 @@ -191,6 +200,36 @@ cdef class IntIndex(SparseIndex): else: return -1 + @cython.wraparound(False) + cpdef ndarray[int32_t] lookup_array(self, ndarray[int32_t, ndim=1] indexer): + """ + Vectorized lookup, returns ndarray[int32_t] + """ + cdef: + Py_ssize_t n, i, ind_val + ndarray[int32_t, ndim=1] inds + ndarray[uint8_t, ndim=1, cast=True] mask + ndarray[int32_t, ndim=1] masked + ndarray[int32_t, ndim=1] res + ndarray[int32_t, ndim=1] results + + n = len(indexer) + results = np.empty(n, dtype=np.int32) + results.fill(-1) + + if self.npoints == 0: + return results + + inds = self.indices + mask = (inds[0] <= indexer) & (indexer <= inds[len(inds) - 1]) + + masked = indexer[mask] + res = inds.searchsorted(masked).astype(np.int32) + + res[inds[res] != masked] = -1 + results[mask] = res + return results + cpdef ndarray reindex(self, ndarray[float64_t, ndim=1] values, float64_t fill_value, SparseIndex other_): cdef: @@ -475,11 +514,11 @@ cdef class BlockIndex(SparseIndex): ''' return BlockUnion(self, y.to_block_index()).result - cpdef lookup(self, Py_ssize_t index): - ''' - - Returns -1 if not found - ''' + cpdef int lookup(self, Py_ssize_t index): + """ + Return the internal location if value exists on given index. + Return -1 otherwise. 
+ """ cdef: Py_ssize_t i, cum_len ndarray[int32_t, ndim=1] locs, lens @@ -500,6 +539,36 @@ cdef class BlockIndex(SparseIndex): return -1 + @cython.wraparound(False) + cpdef ndarray[int32_t] lookup_array(self, ndarray[int32_t, ndim=1] indexer): + """ + Vectorized lookup, returns ndarray[int32_t] + """ + cdef: + Py_ssize_t n, i, j, ind_val + ndarray[int32_t, ndim=1] locs, lens + ndarray[int32_t, ndim=1] results + + locs = self.blocs + lens = self.blengths + + n = len(indexer) + results = np.empty(n, dtype=np.int32) + results.fill(-1) + + if self.npoints == 0: + return results + + for i from 0 <= i < n: + ind_val = indexer[i] + if not (ind_val < 0 or self.length <= ind_val): + cum_len = 0 + for j from 0 <= j < self.nblocks: + if ind_val >= locs[j] and ind_val < locs[j] + lens[j]: + results[i] = cum_len + ind_val - locs[j] + cum_len += lens[j] + return results + cpdef ndarray reindex(self, ndarray[float64_t, ndim=1] values, float64_t fill_value, SparseIndex other_): cdef: diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 1d479868c00a6..788fb4027be84 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1209,7 +1209,10 @@ def assert_sp_array_equal(left, right): # SparseIndex comparison assertIsInstance(left.sp_index, pd._sparse.SparseIndex, '[SparseIndex]') assertIsInstance(right.sp_index, pd._sparse.SparseIndex, '[SparseIndex]') - assert (left.sp_index.equals(right.sp_index)) + + if not left.sp_index.equals(right.sp_index): + raise_assert_detail('SparseArray.index', 'index are not equal', + left.sp_index, right.sp_index) if np.isnan(left.fill_value): assert (np.isnan(right.fill_value))