diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 6fe0ad8092a037..8b615e19e49b5e 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -60,6 +60,7 @@ Backwards incompatible API changes
 
 
 - ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv`` and will be removed in the future (:issue:`12665`)
+- ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`)
 
 
 
diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py
index 4bb36446c9ff7b..0ef10ac52e8899 100644
--- a/pandas/sparse/array.py
+++ b/pandas/sparse/array.py
@@ -620,19 +620,28 @@ def sum(self, axis=0, *args, **kwargs):
 
     def cumsum(self, axis=0, *args, **kwargs):
         """
-        Cumulative sum of values. Preserves locations of NaN values
+        Cumulative sum of non-NA/null values.
+
+        When performing the cumulative summation, any NA/null values will
+        be skipped. The resulting SparseArray will preserve the locations of
+        NaN values, but the fill value will be `np.nan` regardless.
+
+        Parameters
+        ----------
+        axis : int
+            Axis over which to perform the cumulative summation. Currently,
+            this parameter is ignored because `SparseArray` only works with
+            1-D array-like objects.
 
         Returns
         -------
-        cumsum : Series
+        cumsum : SparseArray
         """
         nv.validate_cumsum(args, kwargs)
 
-        # TODO: gh-12855 - return a SparseArray here
-        if notnull(self.fill_value):
-            return self.to_dense().cumsum()
+        if not self._null_fill_value:
+            return SparseArray(self.to_dense()).cumsum()
 
-        # TODO: what if sp_values contains NaN??
         return SparseArray(self.sp_values.cumsum(), sparse_index=self.sp_index,
                            fill_value=self.fill_value)
 
diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py
index 660f76ff1001d0..d13e393d10d90b 100644
--- a/pandas/sparse/series.py
+++ b/pandas/sparse/series.py
@@ -630,21 +630,30 @@ def take(self, indices, axis=0, convert=True, *args, **kwargs):
 
     def cumsum(self, axis=0, *args, **kwargs):
         """
-        Cumulative sum of values. Preserves locations of NaN values
+        Cumulative sum of non-NA/null values.
+
+        When performing the cumulative summation, any NA/null values will
+        be skipped. The resulting SparseSeries will preserve the locations of
+        NaN values, but the fill value will be `np.nan` regardless.
+
+        Parameters
+        ----------
+        axis : int
+            Axis over which to perform the cumulative summation. This
+            parameter is ignored because `SparseSeries` is 1-D, but it
+            is kept in the signature for consistency with other sparse
+            array-like objects.
 
         Returns
         -------
-        cumsum : SparseSeries if `self` has a null `fill_value` and a
-                 generic Series otherwise
+        cumsum : SparseSeries
         """
         nv.validate_cumsum(args, kwargs)
-        new_array = SparseArray.cumsum(self.values)
-        if isinstance(new_array, SparseArray):
-            return self._constructor(
-                new_array, index=self.index,
-                sparse_index=new_array.sp_index).__finalize__(self)
-        # TODO: gh-12855 - return a SparseSeries here
-        return Series(new_array, index=self.index).__finalize__(self)
+        new_array = self.values.cumsum()
+
+        return self._constructor(
+            new_array, index=self.index,
+            sparse_index=new_array.sp_index).__finalize__(self)
 
     @Appender(generic._shared_docs['isnull'])
     def isnull(self):
diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py
index 1c9b6119cf665b..bfab98a76c5513 100644
--- a/pandas/sparse/tests/test_array.py
+++ b/pandas/sparse/tests/test_array.py
@@ -688,46 +688,52 @@ def test_numpy_sum(self):
                               SparseArray(data), out=out)
 
     def test_cumsum(self):
-        data = np.arange(10).astype(float)
-        out = SparseArray(data).cumsum()
-        expected = SparseArray(data.cumsum())
-        tm.assert_sp_array_equal(out, expected)
+        non_null_data = np.array([1, 2, 3, 4, 5], dtype=float)
+        non_null_expected = SparseArray(non_null_data.cumsum())
 
-        # TODO: gh-12855 - return a SparseArray here
-        data[5] = np.nan
-        out = SparseArray(data, fill_value=2).cumsum()
-        self.assertNotIsInstance(out, SparseArray)
-        tm.assert_numpy_array_equal(out, data.cumsum())
+        null_data = np.array([1, 2, np.nan, 4, 5], dtype=float)
+        null_expected = SparseArray(np.array([1.0, 3.0, np.nan, 7.0, 12.0]))
+
+        for data, expected in [
+            (null_data, null_expected),
+            (non_null_data, non_null_expected)
+        ]:
+            out = SparseArray(data).cumsum()
+            tm.assert_sp_array_equal(out, expected)
 
-        out = SparseArray(data, fill_value=np.nan).cumsum()
-        expected = SparseArray(np.array([
-            0, 1, 3, 6, 10, np.nan, 16, 23, 31, 40]))
-        tm.assert_sp_array_equal(out, expected)
+            out = SparseArray(data, fill_value=np.nan).cumsum()
+            tm.assert_sp_array_equal(out, expected)
+
+            out = SparseArray(data, fill_value=2).cumsum()
+            tm.assert_sp_array_equal(out, expected)
 
     def test_numpy_cumsum(self):
-        data = np.arange(10).astype(float)
-        out = np.cumsum(SparseArray(data))
-        expected = SparseArray(data.cumsum())
-        tm.assert_sp_array_equal(out, expected)
+        non_null_data = np.array([1, 2, 3, 4, 5], dtype=float)
+        non_null_expected = SparseArray(non_null_data.cumsum())
 
-        # TODO: gh-12855 - return a SparseArray here
-        data[5] = np.nan
-        out = np.cumsum(SparseArray(data, fill_value=2))
-        self.assertNotIsInstance(out, SparseArray)
-        tm.assert_numpy_array_equal(out, data.cumsum())
+        null_data = np.array([1, 2, np.nan, 4, 5], dtype=float)
+        null_expected = SparseArray(np.array([1.0, 3.0, np.nan, 7.0, 12.0]))
 
-        out = np.cumsum(SparseArray(data, fill_value=np.nan))
-        expected = SparseArray(np.array([
-            0, 1, 3, 6, 10, np.nan, 16, 23, 31, 40]))
-        tm.assert_sp_array_equal(out, expected)
+        for data, expected in [
+            (null_data, null_expected),
+            (non_null_data, non_null_expected)
+        ]:
+            out = np.cumsum(SparseArray(data))
+            tm.assert_sp_array_equal(out, expected)
 
-        msg = "the 'dtype' parameter is not supported"
-        tm.assertRaisesRegexp(ValueError, msg, np.cumsum,
-                              SparseArray(data), dtype=np.int64)
+            out = np.cumsum(SparseArray(data, fill_value=np.nan))
+            tm.assert_sp_array_equal(out, expected)
 
-        msg = "the 'out' parameter is not supported"
-        tm.assertRaisesRegexp(ValueError, msg, np.cumsum,
-                              SparseArray(data), out=out)
+            out = np.cumsum(SparseArray(data, fill_value=2))
+            tm.assert_sp_array_equal(out, expected)
+
+            msg = "the 'dtype' parameter is not supported"
+            tm.assertRaisesRegexp(ValueError, msg, np.cumsum,
+                                  SparseArray(data), dtype=np.int64)
+
+            msg = "the 'out' parameter is not supported"
+            tm.assertRaisesRegexp(ValueError, msg, np.cumsum,
+                                  SparseArray(data), out=out)
 
     def test_mean(self):
         data = np.arange(10).astype(float)
diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py
index 116596e36b4029..b901989f820594 100644
--- a/pandas/sparse/tests/test_series.py
+++ b/pandas/sparse/tests/test_series.py
@@ -1331,10 +1331,9 @@ def test_cumsum(self):
         expected = SparseSeries(self.bseries.to_dense().cumsum())
         tm.assert_sp_series_equal(result, expected)
 
-        # TODO: gh-12855 - return a SparseSeries here
         result = self.zbseries.cumsum()
         expected = self.zbseries.to_dense().cumsum()
-        self.assertNotIsInstance(result, SparseSeries)
+        self.assertIsInstance(result, SparseSeries)
         tm.assert_series_equal(result, expected)
 
     def test_numpy_cumsum(self):
@@ -1342,10 +1341,9 @@ def test_numpy_cumsum(self):
         expected = SparseSeries(self.bseries.to_dense().cumsum())
         tm.assert_sp_series_equal(result, expected)
 
-        # TODO: gh-12855 - return a SparseSeries here
         result = np.cumsum(self.zbseries)
         expected = self.zbseries.to_dense().cumsum()
-        self.assertNotIsInstance(result, SparseSeries)
+        self.assertIsInstance(result, SparseSeries)
         tm.assert_series_equal(result, expected)
 
         msg = "the 'dtype' parameter is not supported"