ENH/PERF SparseArray.take indexing

pandas-dev · Apr 7, 2016 · df1f056 · df1f056
1 parent 522e16d
commit df1f056
Show file tree

Hide file tree

Showing 9 changed files with 567 additions and 69 deletions.
diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt
@@ -81,7 +81,7 @@ These changes conform sparse handling to return the correct types and work to ma
 - Bug in ``SparseSeries.__repr__`` raises ``TypeError`` when it is longer than ``max_rows`` (:issue:`10560`)
 - Bug in ``SparseSeries.shape`` ignores ``fill_value`` (:issue:`10452`)
 - Bug in ``SparseArray.to_dense()`` does not preserve ``dtype`` (:issue:`10648`)
-- ``SparseArray.take`` now returns scalar for scalar input, ``SparseArray`` for others (:issue:`10560`)
+- ``SparseArray.take`` now returns scalar for scalar input, ``SparseArray`` for others. Also now it handles negative indexer as the same rule as ``Index`` (:issue:`10560`, :issue:`12796`)
 
   .. ipython:: python
 

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -809,9 +809,6 @@ def _set_values(self, key, value):
         self._data = self._data.setitem(indexer=key, value=value)
         self._maybe_update_cacher()
 
-    # help out SparseSeries
-    _get_val_at = ndarray.__getitem__
-
     def repeat(self, reps):
         """
         return a new Series with the values repeated reps times

diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
@@ -1332,7 +1332,7 @@ def _ensure_compat_concat(indexes):
         return indexes
 
     _index_shared_docs['take'] = """
-        return a new Index of the values selected by the indices
+        return a new %(klass)s of the values selected by the indices
 
         For internal compatibility with numpy arrays.
 
@@ -1352,7 +1352,7 @@ def _ensure_compat_concat(indexes):
         numpy.ndarray.take
         """
 
-    @Appender(_index_shared_docs['take'])
+    @Appender(_index_shared_docs['take'] % _index_doc_kwargs)
     def take(self, indices, axis=0, allow_fill=True, fill_value=None):
         indices = com._ensure_platform_int(indices)
         if self._can_hold_na:

diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py
@@ -13,11 +13,16 @@
 from pandas import compat, lib
 from pandas.compat import range
 
-from pandas._sparse import BlockIndex, IntIndex
+from pandas._sparse import SparseIndex, BlockIndex, IntIndex
 import pandas._sparse as splib
 import pandas.index as _index
 import pandas.core.ops as ops
 import pandas.formats.printing as printing
+from pandas.util.decorators import Appender
+from pandas.indexes.base import _index_shared_docs
+
+
+_sparray_doc_kwargs = dict(klass='SparseArray')
 
 
 def _arith_method(op, name, str_rep=None, default_axis=None, fill_zeros=None,
@@ -167,10 +172,19 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer',
                 fill_value = bool(fill_value)
 
         # Change the class of the array to be the subclass type.
-        output = subarr.view(cls)
-        output.sp_index = sparse_index
-        output.fill_value = fill_value
-        return output
+        return cls._simple_new(subarr, sparse_index, fill_value)
+
+    @classmethod
+    def _simple_new(cls, data, sp_index, fill_value):
+        result = data.view(cls)
+
+        if not isinstance(sp_index, SparseIndex):
+            # caller must pass SparseIndex
+            raise ValueError('sp_index must be a SparseIndex')
+
+        result.sp_index = sp_index
+        result.fill_value = fill_value
+        return result
 
     @property
     def _constructor(self):
@@ -308,46 +322,53 @@ def _get_val_at(self, loc):
         else:
             return _index.get_value_at(self, sp_loc)
 
-    def take(self, indices, axis=0):
-        """
-        Sparse-compatible version of ndarray.take
+    @Appender(_index_shared_docs['take'] % _sparray_doc_kwargs)
+    def take(self, indices, axis=0, allow_fill=True,
+             fill_value=None):
+
+        # Sparse-compatible version of ndarray.take, returns SparseArray
 
-        Returns
-        -------
-        taken : ndarray
-        """
         if axis:
             raise ValueError("axis must be 0, input was {0}".format(axis))
 
         if com.is_integer(indices):
             # return scalar
             return self[indices]
 
-        indices = np.atleast_1d(np.asarray(indices, dtype=int))
-
-        # allow -1 to indicate missing values
+        indices = com._ensure_platform_int(indices)
         n = len(self)
-        if ((indices >= n) | (indices < -1)).any():
-            raise IndexError('out of bounds access')
-
-        if self.sp_index.npoints > 0:
-            locs = np.array([self.sp_index.lookup(loc) if loc > -1 else -1
-                             for loc in indices])
-            result = self.sp_values.take(locs)
-            mask = locs == -1
-            if mask.any():
-                try:
-                    result[mask] = self.fill_value
-                except ValueError:
-                    # wrong dtype
-                    result = result.astype('float64')
-                    result[mask] = self.fill_value
-
+        if allow_fill and fill_value is not None:
+            # allow -1 to indicate self.fill_value,
+            # self.fill_value may not be NaN
+            if (indices < -1).any():
+                msg = ('When allow_fill=True and fill_value is not None, '
+                       'all indices must be >= -1')
+                raise ValueError(msg)
+            elif (n <= indices).any():
+                msg = 'index is out of bounds for size {0}'
+                raise IndexError(msg.format(n))
+        else:
+            if ((indices < -n) | (n <= indices)).any():
+                msg = 'index is out of bounds for size {0}'
+                raise IndexError(msg.format(n))
+
+        indices = indices.astype(np.int32)
+        if not (allow_fill and fill_value is not None):
+            indices = indices.copy()
+            indices[indices < 0] += n
+
+        locs = self.sp_index.lookup_array(indices)
+        indexer = np.arange(len(locs), dtype=np.int32)
+        mask = locs != -1
+        if mask.any():
+            indexer = indexer[mask]
+            new_values = self.sp_values.take(locs[mask])
         else:
-            result = np.empty(len(indices))
-            result.fill(self.fill_value)
+            indexer = np.empty(shape=(0, ), dtype=np.int32)
+            new_values = np.empty(shape=(0, ), dtype=self.sp_values.dtype)
 
-        return self._constructor(result)
+        sp_index = _make_index(len(indices), indexer, kind=self.sp_index)
+        return self._simple_new(new_values, sp_index, self.fill_value)
 
     def __setitem__(self, key, value):
         # if com.is_integer(key):
@@ -525,16 +546,21 @@ def make_sparse(arr, kind='block', fill_value=nan):
     else:
         indices = np.arange(length, dtype=np.int32)[mask]
 
-    if kind == 'block':
+    index = _make_index(length, indices, kind)
+    sparsified_values = arr[mask]
+    return sparsified_values, index
+
+
+def _make_index(length, indices, kind):
+
+    if kind == 'block' or isinstance(kind, BlockIndex):
         locs, lens = splib.get_blocks(indices)
         index = BlockIndex(length, locs, lens)
-    elif kind == 'integer':
+    elif kind == 'integer' or isinstance(kind, IntIndex):
         index = IntIndex(length, indices)
     else:  # pragma: no cover
         raise ValueError('must be block or integer type')
-
-    sparsified_values = arr[mask]
-    return sparsified_values, index
+    return index
 
 
 ops.add_special_arithmetic_methods(SparseArray, arith_method=_arith_method,

diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py
@@ -165,10 +165,10 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block',
                 if index is None:
                     index = data.index.view()
                 else:
+
                     data = data.reindex(index, copy=False)
 
             else:
-
                 length = len(index)
 
                 if data == fill_value or (isnull(data) and isnull(fill_value)):
@@ -376,11 +376,6 @@ def _get_val_at(self, loc):
         """ forward to the array """
         return self.block.values._get_val_at(loc)
 
-    def _slice(self, slobj, axis=0, kind=None):
-        slobj = self.index._convert_slice_indexer(slobj,
-                                                  kind=kind or 'getitem')
-        return self._get_values(slobj)
-
     def __getitem__(self, key):
         """