From 96394f36cfbe73a108f32b6dcfa8770a49d66d6a Mon Sep 17 00:00:00 2001 From: Licht Takeuchi Date: Thu, 28 Sep 2017 23:11:25 +0900 Subject: [PATCH] BUG: Fix make_sparse mask generation (#17574) --- asv_bench/benchmarks/sparse.py | 65 ++++++++++++++++++++++++++++++- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/_libs/sparse.pyx | 19 +++++++++ pandas/core/sparse/array.py | 9 ++++- pandas/tests/sparse/test_array.py | 9 +++++ 5 files changed, 101 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index b958f5e0e5c342..a46205026481e5 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -2,7 +2,7 @@ from .pandas_vb_common import * import scipy.sparse -from pandas import SparseSeries, SparseDataFrame +from pandas import SparseSeries, SparseDataFrame, SparseArray class sparse_series_to_frame(object): @@ -23,6 +23,69 @@ def time_sparse_series_to_frame(self): SparseDataFrame(self.series) +class sparse_array_constructor(object): + goal_time = 0.2 + + def setup(self): + np.random.seed(1) + self.int64_10percent = self.make_numeric_array(length=1000000, dense_size=100000, fill_value=0, dtype=np.int64) + self.int64_1percent = self.make_numeric_array(length=1000000, dense_size=10000, fill_value=0, dtype=np.int64) + + self.float64_10percent = self.make_numeric_array(length=1000000, dense_size=100000, fill_value=np.nan, dtype=np.float64) + self.float64_1percent = self.make_numeric_array(length=1000000, dense_size=10000, fill_value=np.nan, dtype=np.float64) + + self.object_nan_fill_value_10percent = self.make_object_array(length=1000000, dense_size=100000, fill_value=np.nan) + self.object_nan_fill_value_1percent = self.make_object_array(length=1000000, dense_size=10000, fill_value=np.nan) + + self.object_non_nan_fill_value_10percent = self.make_object_array(length=1000000, dense_size=100000, fill_value=0) + self.object_non_nan_fill_value_1percent = self.make_object_array(length=1000000, 
dense_size=10000, fill_value=0) + + def make_numeric_array(self, length, dense_size, fill_value, dtype): + arr = np.array([fill_value] * length, dtype=dtype) + indexer = np.unique(np.random.randint(0, length, dense_size)) + arr[indexer] = np.random.randint(0, 100, len(indexer)) + return (arr, fill_value, dtype) + + def make_object_array(self, length, dense_size, fill_value): + elems = np.array(['a', 0.0, False, 1, 2], dtype=np.object) + arr = np.array([fill_value] * length, dtype=np.object) + indexer = np.unique(np.random.randint(0, length, dense_size)) + arr[indexer] = np.random.choice(elems, len(indexer)) + return (arr, fill_value, np.object) + + def time_sparse_array_constructor_int64_10percent(self): + arr, fill_value, dtype = self.int64_10percent + SparseArray(arr, fill_value=fill_value, dtype=dtype) + + def time_sparse_array_constructor_int64_1percent(self): + arr, fill_value, dtype = self.int64_1percent + SparseArray(arr, fill_value=fill_value, dtype=dtype) + + def time_sparse_array_constructor_float64_10percent(self): + arr, fill_value, dtype = self.float64_10percent + SparseArray(arr, fill_value=fill_value, dtype=dtype) + + def time_sparse_array_constructor_float64_1percent(self): + arr, fill_value, dtype = self.float64_1percent + SparseArray(arr, fill_value=fill_value, dtype=dtype) + + def time_sparse_array_constructor_object_nan_fill_value_10percent(self): + arr, fill_value, dtype = self.object_nan_fill_value_10percent + SparseArray(arr, fill_value=fill_value, dtype=dtype) + + def time_sparse_array_constructor_object_nan_fill_value_1percent(self): + arr, fill_value, dtype = self.object_nan_fill_value_1percent + SparseArray(arr, fill_value=fill_value, dtype=dtype) + + def time_sparse_array_constructor_object_non_nan_fill_value_10percent(self): + arr, fill_value, dtype = self.object_non_nan_fill_value_10percent + SparseArray(arr, fill_value=fill_value, dtype=dtype) + + def time_sparse_array_constructor_object_non_nan_fill_value_1percent(self): + arr, 
fill_value, dtype = self.object_non_nan_fill_value_1percent + SparseArray(arr, fill_value=fill_value, dtype=dtype) + + class sparse_frame_constructor(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 7f7775e2389665..7572c8fd196c81 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -631,6 +631,7 @@ Sparse - Bug in ``SparseSeries`` raises ``AttributeError`` when a dictionary is passed in as data (:issue:`16905`) - Bug in :func:`SparseDataFrame.fillna` not filling all NaNs when frame was instantiated from SciPy sparse matrix (:issue:`16112`) - Bug in :func:`SparseSeries.unstack` and :func:`SparseDataFrame.stack` (:issue:`16614`, :issue:`15045`) +- Bug in :func:`make_sparse` treating numeric/boolean values that compare equal but have different types (e.g. ``0``, ``0.0`` and ``False``) as the same value when the array ``dtype`` is ``object`` (:issue:`17574`) Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index 1cc7f5ace95ea5..fac678e531c8be 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -848,3 +848,22 @@ def reindex_integer(ndarray[float64_t, ndim=1] values, IntIndex sparse_index, ndarray[int32_t, ndim=1] indexer): pass + + +# ----------------------------------------------------------------------------- +# SparseArray mask create operations + +def make_mask_object_ndarray(ndarray[object, ndim=1] arr, object fill_value): + cdef object value + cdef Py_ssize_t i + cdef Py_ssize_t new_length = len(arr) + cdef ndarray[int8_t, ndim=1] mask + + mask = np.ones(new_length, dtype=np.int8) + + for i in range(new_length): + value = arr[i] + if value == fill_value and type(value) == type(fill_value): + mask[i] = 0 + + return mask.view(dtype=np.bool) diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index f965c91999a03d..3b45a013734c91 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -19,6 +19,7 @@ from pandas.core.dtypes.common import ( 
_ensure_platform_int, is_float, is_integer, + is_object_dtype, is_integer_dtype, is_bool_dtype, is_list_like, @@ -789,7 +790,13 @@ def make_sparse(arr, kind='block', fill_value=None): if is_string_dtype(arr): arr = arr.astype(object) - mask = arr != fill_value + if is_object_dtype(arr.dtype): + # numpy's element-wise equality check ignores element types, + # e.g. 0, 0.0, and False are all treated as equal. So we have to + # check both the type and the value of each element. + mask = splib.make_mask_object_ndarray(arr, fill_value) + else: + mask = arr != fill_value length = len(arr) if length != mask.size: diff --git a/pandas/tests/sparse/test_array.py b/pandas/tests/sparse/test_array.py index b0a9182a265fe8..f653ee50982ad1 100644 --- a/pandas/tests/sparse/test_array.py +++ b/pandas/tests/sparse/test_array.py @@ -61,6 +61,15 @@ def test_constructor_object_dtype(self): assert arr.dtype == np.object assert arr.fill_value == 'A' + # GH 17574 + data = [False, 0, 100.0, 0.0] + arr = SparseArray(data, dtype=np.object, fill_value=False) + assert arr.dtype == np.object + assert arr.fill_value is False + arr_expected = np.array(data, dtype=np.object) + it = (type(x) == type(y) and x == y for x, y in zip(arr, arr_expected)) + assert np.fromiter(it, dtype=np.bool).all() + def test_constructor_spindex_dtype(self): arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2])) tm.assert_sp_array_equal(arr, SparseArray([np.nan, 1, 2, np.nan]))