Skip to content

Commit

Permalink
BUG: Fix make_sparse mask generation (pandas-dev#17574)
Browse files Browse the repository at this point in the history
  • Loading branch information
Licht-T authored and No-Stream committed Nov 28, 2017
1 parent 8dafef1 commit 5879aa5
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 2 deletions.
65 changes: 64 additions & 1 deletion asv_bench/benchmarks/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from .pandas_vb_common import *
import scipy.sparse
from pandas import SparseSeries, SparseDataFrame
from pandas import SparseSeries, SparseDataFrame, SparseArray


class sparse_series_to_frame(object):
Expand All @@ -23,6 +23,69 @@ def time_sparse_series_to_frame(self):
SparseDataFrame(self.series)


class sparse_array_constructor(object):
    """Benchmarks for building a ``SparseArray`` from a dense ndarray.

    Each fixture is an ``(array, fill_value, dtype)`` triple covering int64,
    float64 and object dtypes at roughly 10% and 1% density.
    """

    goal_time = 0.2

    def setup(self):
        """Seed the RNG and pre-build the fixtures read by the ``time_*``
        methods."""
        np.random.seed(1)
        self.int64_10percent = self.make_numeric_array(
            length=1000000, dense_size=100000,
            fill_value=0, dtype=np.int64)
        self.int64_1percent = self.make_numeric_array(
            length=1000000, dense_size=10000,
            fill_value=0, dtype=np.int64)

        self.float64_10percent = self.make_numeric_array(
            length=1000000, dense_size=100000,
            fill_value=np.nan, dtype=np.float64)
        self.float64_1percent = self.make_numeric_array(
            length=1000000, dense_size=10000,
            fill_value=np.nan, dtype=np.float64)

        self.object_nan_fill_value_10percent = self.make_object_array(
            length=1000000, dense_size=100000, fill_value=np.nan)
        self.object_nan_fill_value_1percent = self.make_object_array(
            length=1000000, dense_size=10000, fill_value=np.nan)

        self.object_non_nan_fill_value_10percent = self.make_object_array(
            length=1000000, dense_size=100000, fill_value=0)
        self.object_non_nan_fill_value_1percent = self.make_object_array(
            length=1000000, dense_size=10000, fill_value=0)

    def make_numeric_array(self, length, dense_size, fill_value, dtype):
        """Return ``(arr, fill_value, dtype)`` for a numeric benchmark input.

        ``arr`` has ``length`` entries of ``dtype`` initialised to
        ``fill_value``; up to ``dense_size`` randomly chosen positions
        (duplicates are collapsed by ``np.unique``) are then overwritten
        with random integers drawn from ``[0, 100)``.
        """
        # np.full avoids materialising a `length`-element Python list first.
        arr = np.full(length, fill_value, dtype=dtype)
        indexer = np.unique(np.random.randint(0, length, dense_size))
        arr[indexer] = np.random.randint(0, 100, len(indexer))
        return (arr, fill_value, dtype)

    def make_object_array(self, length, dense_size, fill_value):
        """Return ``(arr, fill_value, object)`` for an object benchmark input.

        ``arr`` is an object-dtype array of ``length`` entries initialised to
        ``fill_value``; up to ``dense_size`` randomly chosen positions are
        then overwritten with values sampled from a small pool of mixed-type
        objects (str, float, bool, int).
        """
        # The builtin `object` is used instead of the `np.object` alias,
        # which was deprecated and later removed from NumPy.
        elems = np.array(['a', 0.0, False, 1, 2], dtype=object)
        arr = np.full(length, fill_value, dtype=object)
        indexer = np.unique(np.random.randint(0, length, dense_size))
        arr[indexer] = np.random.choice(elems, len(indexer))
        return (arr, fill_value, object)

    def time_sparse_array_constructor_int64_10percent(self):
        arr, fill_value, dtype = self.int64_10percent
        SparseArray(arr, fill_value=fill_value, dtype=dtype)

    def time_sparse_array_constructor_int64_1percent(self):
        arr, fill_value, dtype = self.int64_1percent
        SparseArray(arr, fill_value=fill_value, dtype=dtype)

    def time_sparse_array_constructor_float64_10percent(self):
        arr, fill_value, dtype = self.float64_10percent
        SparseArray(arr, fill_value=fill_value, dtype=dtype)

    def time_sparse_array_constructor_float64_1percent(self):
        arr, fill_value, dtype = self.float64_1percent
        SparseArray(arr, fill_value=fill_value, dtype=dtype)

    def time_sparse_array_constructor_object_nan_fill_value_10percent(self):
        arr, fill_value, dtype = self.object_nan_fill_value_10percent
        SparseArray(arr, fill_value=fill_value, dtype=dtype)

    def time_sparse_array_constructor_object_nan_fill_value_1percent(self):
        arr, fill_value, dtype = self.object_nan_fill_value_1percent
        SparseArray(arr, fill_value=fill_value, dtype=dtype)

    def time_sparse_array_constructor_object_non_nan_fill_value_10percent(
            self):
        arr, fill_value, dtype = self.object_non_nan_fill_value_10percent
        SparseArray(arr, fill_value=fill_value, dtype=dtype)

    def time_sparse_array_constructor_object_non_nan_fill_value_1percent(
            self):
        arr, fill_value, dtype = self.object_non_nan_fill_value_1percent
        SparseArray(arr, fill_value=fill_value, dtype=dtype)


class sparse_frame_constructor(object):
goal_time = 0.2

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -631,6 +631,7 @@ Sparse
- Bug in ``SparseSeries`` raising ``AttributeError`` when a dictionary is passed in as data (:issue:`16905`)
- Bug in :func:`SparseDataFrame.fillna` not filling all NaNs when frame was instantiated from SciPy sparse matrix (:issue:`16112`)
- Bug in :func:`SparseSeries.unstack` and :func:`SparseDataFrame.stack` (:issue:`16614`, :issue:`15045`)
- Bug in :func:`make_sparse` treating numeric and boolean values that compare equal but have different types (e.g. ``0``, ``0.0`` and ``False``) as the same value when the array ``dtype`` is ``object`` (:issue:`17574`)

Reshaping
^^^^^^^^^
Expand Down
19 changes: 19 additions & 0 deletions pandas/_libs/sparse.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -848,3 +848,22 @@ def reindex_integer(ndarray[float64_t, ndim=1] values,
IntIndex sparse_index,
ndarray[int32_t, ndim=1] indexer):
pass


# -----------------------------------------------------------------------------
# SparseArray mask create operations

def make_mask_object_ndarray(ndarray[object, ndim=1] arr, object fill_value):
    """
    Build a boolean mask of the entries of ``arr`` that differ from
    ``fill_value``.

    An entry matches ``fill_value`` only when it has exactly the same type
    *and* compares equal to it, so values such as ``0``, ``0.0`` and
    ``False`` are kept distinct from one another.

    Parameters
    ----------
    arr : 1-d ndarray of object dtype
    fill_value : object

    Returns
    -------
    mask : 1-d boolean ndarray with the same length as ``arr``
        ``False`` where the entry matches ``fill_value``, ``True`` elsewhere.

    Notes
    -----
    Matching relies on ``==``, so a ``fill_value`` that does not compare
    equal to itself (e.g. NaN) never matches any entry.
    """
    cdef:
        object value
        object fill_type = type(fill_value)
        Py_ssize_t i
        Py_ssize_t length = len(arr)
        ndarray[int8_t, ndim=1] mask

    # Start with every position marked as "differs from fill_value".
    mask = np.ones(length, dtype=np.int8)

    for i in range(length):
        value = arr[i]
        # Check the exact type first (not isinstance: bool subclasses int),
        # so ``==`` is only evaluated between objects of the same type.
        if type(value) is fill_type and value == fill_value:
            mask[i] = 0

    # Reinterpret the int8 0/1 buffer as booleans without copying.
    # ``np.bool_`` replaces the removed ``np.bool`` alias.
    return mask.view(dtype=np.bool_)
9 changes: 8 additions & 1 deletion pandas/core/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from pandas.core.dtypes.common import (
_ensure_platform_int,
is_float, is_integer,
is_object_dtype,
is_integer_dtype,
is_bool_dtype,
is_list_like,
Expand Down Expand Up @@ -789,7 +790,13 @@ def make_sparse(arr, kind='block', fill_value=None):
if is_string_dtype(arr):
arr = arr.astype(object)

mask = arr != fill_value
if is_object_dtype(arr.dtype):
# NumPy's element-wise equality check ignores element types, e.g. 0,
# 0.0, and False all compare equal. So we have to check both the
# type and the value of each element.
mask = splib.make_mask_object_ndarray(arr, fill_value)
else:
mask = arr != fill_value

length = len(arr)
if length != mask.size:
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/sparse/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,15 @@ def test_constructor_object_dtype(self):
assert arr.dtype == np.object
assert arr.fill_value == 'A'

# GH 17574
data = [False, 0, 100.0, 0.0]
arr = SparseArray(data, dtype=np.object, fill_value=False)
assert arr.dtype == np.object
assert arr.fill_value is False
arr_expected = np.array(data, dtype=np.object)
it = (type(x) == type(y) and x == y for x, y in zip(arr, arr_expected))
assert np.fromiter(it, dtype=np.bool).all()

def test_constructor_spindex_dtype(self):
arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]))
tm.assert_sp_array_equal(arr, SparseArray([np.nan, 1, 2, np.nan]))
Expand Down

0 comments on commit 5879aa5

Please sign in to comment.