ENH/PERF SparseArray.take indexing
related to #4400

Added more tests for sparse indexing.
``SparseArray.take`` now has optimized logic that avoids creating a dense ``np.ndarray``.
``SparseSeries.iloc`` can now work with negative indices.
Made ``SparseArray.take`` handle negative indices under the same rules as ``Index`` (#12676)
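
A short usage sketch of the behaviour described above (illustrative only; the array contents and results are assumptions based on this description, not taken from the test suite):

import numpy as np
import pandas as pd

arr = pd.SparseArray([np.nan, 1.0, 2.0, np.nan, 3.0])

# Scalar input returns a scalar, list-like input returns a SparseArray.
arr.take(1)                # 1.0

# Negative indices now follow the same rules as Index.take:
# by default -1 means "the last element", not "missing".
arr.take([1, -1])          # SparseArray([1.0, 3.0])

# With allow_fill=True and an explicit fill_value, -1 instead marks a
# position to be filled, mirroring Index.take's fill semantics.
arr.take([1, -1], allow_fill=True, fill_value=np.nan)   # SparseArray([1.0, nan])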

Author: sinhrks <sinhrks@gmail.com>

Closes #12796 from sinhrks/sparse_test_at and squashes the following commits:

df1f056 [sinhrks] ENH/PERF SparseArray.take indexing
sinhrks authored and jreback committed Apr 7, 2016
1 parent 8250d7c commit c90cdde
Showing 9 changed files with 567 additions and 69 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.18.1.txt
@@ -81,7 +81,7 @@ These changes conform sparse handling to return the correct types and work to ma
- Bug in ``SparseSeries.__repr__`` raises ``TypeError`` when it is longer than ``max_rows`` (:issue:`10560`)
- Bug in ``SparseSeries.shape`` ignores ``fill_value`` (:issue:`10452`)
- Bug in ``SparseArray.to_dense()`` does not preserve ``dtype`` (:issue:`10648`)
- ``SparseArray.take`` now returns scalar for scalar input, ``SparseArray`` for others (:issue:`10560`)
- ``SparseArray.take`` now returns a scalar for scalar input and a ``SparseArray`` otherwise. It also handles negative indexers under the same rules as ``Index`` (:issue:`10560`, :issue:`12796`)

.. ipython:: python

3 changes: 0 additions & 3 deletions pandas/core/series.py
@@ -809,9 +809,6 @@ def _set_values(self, key, value):
self._data = self._data.setitem(indexer=key, value=value)
self._maybe_update_cacher()

# help out SparseSeries
_get_val_at = ndarray.__getitem__

def repeat(self, reps):
"""
return a new Series with the values repeated reps times
4 changes: 2 additions & 2 deletions pandas/indexes/base.py
@@ -1332,7 +1332,7 @@ def _ensure_compat_concat(indexes):
return indexes

_index_shared_docs['take'] = """
return a new Index of the values selected by the indices
return a new %(klass)s of the values selected by the indices
For internal compatibility with numpy arrays.
@@ -1352,7 +1352,7 @@ def _ensure_compat_concat(indexes):
numpy.ndarray.take
"""

@Appender(_index_shared_docs['take'])
@Appender(_index_shared_docs['take'] % _index_doc_kwargs)
def take(self, indices, axis=0, allow_fill=True, fill_value=None):
indices = com._ensure_platform_int(indices)
if self._can_hold_na:
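
For context, a minimal sketch of the shared-docstring mechanism this hunk relies on: a single template stored once, formatted per class via ``%(klass)s`` and appended to each ``take`` implementation. This is a simplified stand-in with invented names, not the actual ``pandas.util.decorators.Appender``:

def append_doc(template, **kwargs):
    """Append a %-formatted doc template to a function's docstring."""
    def decorator(func):
        func.__doc__ = (func.__doc__ or '') + (template % kwargs)
        return func
    return decorator

_shared_take_doc = """
return a new %(klass)s of the values selected by the indices

For internal compatibility with numpy arrays.
"""

class DemoIndex(object):
    def __init__(self, values):
        self.values = list(values)

    @append_doc(_shared_take_doc, klass='DemoIndex')
    def take(self, indices):
        """Select values by position."""
        return DemoIndex(self.values[i] for i in indices)

With this pattern, ``SparseArray.take`` below can reuse the same docstring template by substituting ``klass='SparseArray'``.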
104 changes: 65 additions & 39 deletions pandas/sparse/array.py
@@ -13,11 +13,16 @@
from pandas import compat, lib
from pandas.compat import range

from pandas._sparse import BlockIndex, IntIndex
from pandas._sparse import SparseIndex, BlockIndex, IntIndex
import pandas._sparse as splib
import pandas.index as _index
import pandas.core.ops as ops
import pandas.formats.printing as printing
from pandas.util.decorators import Appender
from pandas.indexes.base import _index_shared_docs


_sparray_doc_kwargs = dict(klass='SparseArray')


def _arith_method(op, name, str_rep=None, default_axis=None, fill_zeros=None,
@@ -167,10 +172,19 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer',
fill_value = bool(fill_value)

# Change the class of the array to be the subclass type.
output = subarr.view(cls)
output.sp_index = sparse_index
output.fill_value = fill_value
return output
return cls._simple_new(subarr, sparse_index, fill_value)

@classmethod
def _simple_new(cls, data, sp_index, fill_value):
result = data.view(cls)

if not isinstance(sp_index, SparseIndex):
# caller must pass SparseIndex
raise ValueError('sp_index must be a SparseIndex')

result.sp_index = sp_index
result.fill_value = fill_value
return result

@property
def _constructor(self):
@@ -308,46 +322,53 @@ def _get_val_at(self, loc):
else:
return _index.get_value_at(self, sp_loc)

def take(self, indices, axis=0):
"""
Sparse-compatible version of ndarray.take
@Appender(_index_shared_docs['take'] % _sparray_doc_kwargs)
def take(self, indices, axis=0, allow_fill=True,
fill_value=None):

# Sparse-compatible version of ndarray.take, returns SparseArray

Returns
-------
taken : ndarray
"""
if axis:
raise ValueError("axis must be 0, input was {0}".format(axis))

if com.is_integer(indices):
# return scalar
return self[indices]

indices = np.atleast_1d(np.asarray(indices, dtype=int))

# allow -1 to indicate missing values
indices = com._ensure_platform_int(indices)
n = len(self)
if ((indices >= n) | (indices < -1)).any():
raise IndexError('out of bounds access')

if self.sp_index.npoints > 0:
locs = np.array([self.sp_index.lookup(loc) if loc > -1 else -1
for loc in indices])
result = self.sp_values.take(locs)
mask = locs == -1
if mask.any():
try:
result[mask] = self.fill_value
except ValueError:
# wrong dtype
result = result.astype('float64')
result[mask] = self.fill_value

if allow_fill and fill_value is not None:
# allow -1 to indicate self.fill_value,
# self.fill_value may not be NaN
if (indices < -1).any():
msg = ('When allow_fill=True and fill_value is not None, '
'all indices must be >= -1')
raise ValueError(msg)
elif (n <= indices).any():
msg = 'index is out of bounds for size {0}'
raise IndexError(msg.format(n))
else:
if ((indices < -n) | (n <= indices)).any():
msg = 'index is out of bounds for size {0}'
raise IndexError(msg.format(n))

indices = indices.astype(np.int32)
if not (allow_fill and fill_value is not None):
indices = indices.copy()
indices[indices < 0] += n

locs = self.sp_index.lookup_array(indices)
indexer = np.arange(len(locs), dtype=np.int32)
mask = locs != -1
if mask.any():
indexer = indexer[mask]
new_values = self.sp_values.take(locs[mask])
else:
result = np.empty(len(indices))
result.fill(self.fill_value)
indexer = np.empty(shape=(0, ), dtype=np.int32)
new_values = np.empty(shape=(0, ), dtype=self.sp_values.dtype)

return self._constructor(result)
sp_index = _make_index(len(indices), indexer, kind=self.sp_index)
return self._simple_new(new_values, sp_index, self.fill_value)

def __setitem__(self, key, value):
# if com.is_integer(key):
@@ -525,16 +546,21 @@ def make_sparse(arr, kind='block', fill_value=nan):
else:
indices = np.arange(length, dtype=np.int32)[mask]

if kind == 'block':
index = _make_index(length, indices, kind)
sparsified_values = arr[mask]
return sparsified_values, index


def _make_index(length, indices, kind):

if kind == 'block' or isinstance(kind, BlockIndex):
locs, lens = splib.get_blocks(indices)
index = BlockIndex(length, locs, lens)
elif kind == 'integer':
elif kind == 'integer' or isinstance(kind, IntIndex):
index = IntIndex(length, indices)
else: # pragma: no cover
raise ValueError('must be block or integer type')

sparsified_values = arr[mask]
return sparsified_values, index
return index


ops.add_special_arithmetic_methods(SparseArray, arith_method=_arith_method,
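
An illustrative sketch (plain NumPy, not pandas internals) of the strategy the new ``take`` uses: map the requested positions through the sparse index, gather only from ``sp_values``, and build a fresh sparse index for the result, so no dense array of the full length is ever materialised. The function and variable names here are invented for illustration:

import numpy as np

def sparse_take_sketch(sp_values, sp_locs, fill_value, indices, n):
    """Take `indices` from a sparse vector of length `n` whose stored
    values are `sp_values` at positions `sp_locs`; every other position
    implicitly holds `fill_value`."""
    indices = np.asarray(indices, dtype=np.int32)
    # Index-style negative indexing: -1 means the last element.
    indices = np.where(indices < 0, indices + n, indices)

    # For each requested position, find its slot in sp_values (-1 = a "hole").
    slot_of = {loc: i for i, loc in enumerate(sp_locs)}
    locs = np.array([slot_of.get(int(i), -1) for i in indices], dtype=np.int32)

    mask = locs != -1
    new_sp_values = np.asarray(sp_values)[locs[mask]]            # stored values only
    new_sp_locs = np.arange(len(indices), dtype=np.int32)[mask]  # where they live
    # Positions absent from new_sp_locs implicitly hold fill_value.
    return new_sp_values, new_sp_locs, fill_value

# e.g. a sparse view of [nan, 1.0, nan, 2.0, 3.0]:
values, locs, fv = sparse_take_sketch(
    sp_values=[1.0, 2.0, 3.0], sp_locs=[1, 3, 4],
    fill_value=np.nan, indices=[0, 1, -1], n=5)
# values -> [1.0, 3.0], locs -> [1, 2]: the result represents [nan, 1.0, 3.0]

In the actual diff, ``_make_index`` is refactored out of ``make_sparse`` and now also accepts an existing ``BlockIndex``/``IntIndex`` as ``kind``, so the result of ``take`` reuses the same index kind as the caller.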
7 changes: 1 addition & 6 deletions pandas/sparse/series.py
@@ -165,10 +165,10 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block',
if index is None:
index = data.index.view()
else:

data = data.reindex(index, copy=False)

else:

length = len(index)

if data == fill_value or (isnull(data) and isnull(fill_value)):
@@ -376,11 +376,6 @@ def _get_val_at(self, loc):
""" forward to the array """
return self.block.values._get_val_at(loc)

def _slice(self, slobj, axis=0, kind=None):
slobj = self.index._convert_slice_indexer(slobj,
kind=kind or 'getitem')
return self._get_values(slobj)

def __getitem__(self, key):
"""
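
A small usage sketch of the ``SparseSeries.iloc`` behaviour mentioned in the commit message (assuming a build that includes this change; the values are illustrative):

import numpy as np
import pandas as pd

s = pd.Series([np.nan, 1.0, 2.0, np.nan, 3.0]).to_sparse()

# With the dedicated _slice/_get_val_at overrides removed, iloc goes
# through the same positional machinery as a regular Series, so
# negative positions now work:
s.iloc[-1]         # 3.0
s.iloc[[1, -1]]    # a SparseSeries containing 1.0 and 3.0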
(Diffs for the remaining changed files are not shown here.)
