diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index f534c67273560..556a22ddf46ad 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -70,6 +70,7 @@ Backwards incompatible API changes - ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv`` and will be removed in the future (:issue:`12665`) +- ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`) diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index 4bb36446c9ff7..da13726e88a14 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -620,19 +620,30 @@ def sum(self, axis=0, *args, **kwargs): def cumsum(self, axis=0, *args, **kwargs): """ - Cumulative sum of values. Preserves locations of NaN values + Cumulative sum of non-NA/null values. + + When performing the cumulative summation, any NA/null values will + be skipped. The resulting SparseArray will preserve the locations of + NaN values, but the fill value will be `np.nan` regardless. + + Parameters + ---------- + axis : int or None + Axis over which to perform the cumulative summation. If None, + perform cumulative summation over flattened array. Returns ------- - cumsum : Series + cumsum : SparseArray """ nv.validate_cumsum(args, kwargs) - # TODO: gh-12855 - return a SparseArray here - if notnull(self.fill_value): - return self.to_dense().cumsum() + if axis is not None and axis >= self.ndim: # Mimic ndarray behaviour. + raise ValueError("axis(={axis}) out of bounds".format(axis=axis)) + + if not self._null_fill_value: + return SparseArray(self.to_dense()).cumsum() - # TODO: what if sp_values contains NaN?? 
return SparseArray(self.sp_values.cumsum(), sparse_index=self.sp_index, fill_value=self.fill_value) diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 660f76ff1001d..d6bc892921c42 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -630,21 +630,29 @@ def take(self, indices, axis=0, convert=True, *args, **kwargs): def cumsum(self, axis=0, *args, **kwargs): """ - Cumulative sum of values. Preserves locations of NaN values + Cumulative sum of non-NA/null values. + + When performing the cumulative summation, any NA/null values will + be skipped. The resulting SparseSeries will preserve the locations of + NaN values, but the fill value will be `np.nan` regardless. + + Parameters + ---------- + axis : {0} Returns ------- - cumsum : SparseSeries if `self` has a null `fill_value` and a - generic Series otherwise + cumsum : SparseSeries """ nv.validate_cumsum(args, kwargs) - new_array = SparseArray.cumsum(self.values) - if isinstance(new_array, SparseArray): - return self._constructor( - new_array, index=self.index, - sparse_index=new_array.sp_index).__finalize__(self) - # TODO: gh-12855 - return a SparseSeries here - return Series(new_array, index=self.index).__finalize__(self) + if axis is not None: + axis = self._get_axis_number(axis) + + new_array = self.values.cumsum() + + return self._constructor( + new_array, index=self.index, + sparse_index=new_array.sp_index).__finalize__(self) @Appender(generic._shared_docs['isnull']) def isnull(self): diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py index 1c9b6119cf665..00476a3d946a8 100644 --- a/pandas/sparse/tests/test_array.py +++ b/pandas/sparse/tests/test_array.py @@ -688,46 +688,57 @@ def test_numpy_sum(self): SparseArray(data), out=out) def test_cumsum(self): - data = np.arange(10).astype(float) - out = SparseArray(data).cumsum() - expected = SparseArray(data.cumsum()) - tm.assert_sp_array_equal(out, expected) + non_null_data = np.array([1, 2, 3, 4, 
5], dtype=float) + non_null_expected = SparseArray(non_null_data.cumsum()) - # TODO: gh-12855 - return a SparseArray here - data[5] = np.nan - out = SparseArray(data, fill_value=2).cumsum() - self.assertNotIsInstance(out, SparseArray) - tm.assert_numpy_array_equal(out, data.cumsum()) + null_data = np.array([1, 2, np.nan, 4, 5], dtype=float) + null_expected = SparseArray(np.array([1.0, 3.0, np.nan, 7.0, 12.0])) + + for data, expected in [ + (null_data, null_expected), + (non_null_data, non_null_expected) + ]: + out = SparseArray(data).cumsum() + tm.assert_sp_array_equal(out, expected) + + out = SparseArray(data, fill_value=np.nan).cumsum() + tm.assert_sp_array_equal(out, expected) - out = SparseArray(data, fill_value=np.nan).cumsum() - expected = SparseArray(np.array([ - 0, 1, 3, 6, 10, np.nan, 16, 23, 31, 40])) - tm.assert_sp_array_equal(out, expected) + out = SparseArray(data, fill_value=2).cumsum() + tm.assert_sp_array_equal(out, expected) + + axis = 1 # SparseArray currently 1-D, so only axis = 0 is valid. 
+ msg = "axis\(={axis}\) out of bounds".format(axis=axis) + with tm.assertRaisesRegexp(ValueError, msg): + SparseArray(data).cumsum(axis=axis) def test_numpy_cumsum(self): - data = np.arange(10).astype(float) - out = np.cumsum(SparseArray(data)) - expected = SparseArray(data.cumsum()) - tm.assert_sp_array_equal(out, expected) + non_null_data = np.array([1, 2, 3, 4, 5], dtype=float) + non_null_expected = SparseArray(non_null_data.cumsum()) - # TODO: gh-12855 - return a SparseArray here - data[5] = np.nan - out = np.cumsum(SparseArray(data, fill_value=2)) - self.assertNotIsInstance(out, SparseArray) - tm.assert_numpy_array_equal(out, data.cumsum()) + null_data = np.array([1, 2, np.nan, 4, 5], dtype=float) + null_expected = SparseArray(np.array([1.0, 3.0, np.nan, 7.0, 12.0])) - out = np.cumsum(SparseArray(data, fill_value=np.nan)) - expected = SparseArray(np.array([ - 0, 1, 3, 6, 10, np.nan, 16, 23, 31, 40])) - tm.assert_sp_array_equal(out, expected) + for data, expected in [ + (null_data, null_expected), + (non_null_data, non_null_expected) + ]: + out = np.cumsum(SparseArray(data)) + tm.assert_sp_array_equal(out, expected) - msg = "the 'dtype' parameter is not supported" - tm.assertRaisesRegexp(ValueError, msg, np.cumsum, - SparseArray(data), dtype=np.int64) + out = np.cumsum(SparseArray(data, fill_value=np.nan)) + tm.assert_sp_array_equal(out, expected) - msg = "the 'out' parameter is not supported" - tm.assertRaisesRegexp(ValueError, msg, np.cumsum, - SparseArray(data), out=out) + out = np.cumsum(SparseArray(data, fill_value=2)) + tm.assert_sp_array_equal(out, expected) + + msg = "the 'dtype' parameter is not supported" + tm.assertRaisesRegexp(ValueError, msg, np.cumsum, + SparseArray(data), dtype=np.int64) + + msg = "the 'out' parameter is not supported" + tm.assertRaisesRegexp(ValueError, msg, np.cumsum, + SparseArray(data), out=out) def test_mean(self): data = np.arange(10).astype(float) diff --git a/pandas/sparse/tests/test_series.py 
b/pandas/sparse/tests/test_series.py index 116596e36b402..14339ab388a5d 100644 --- a/pandas/sparse/tests/test_series.py +++ b/pandas/sparse/tests/test_series.py @@ -1331,21 +1331,22 @@ def test_cumsum(self): expected = SparseSeries(self.bseries.to_dense().cumsum()) tm.assert_sp_series_equal(result, expected) - # TODO: gh-12855 - return a SparseSeries here result = self.zbseries.cumsum() expected = self.zbseries.to_dense().cumsum() - self.assertNotIsInstance(result, SparseSeries) tm.assert_series_equal(result, expected) + axis = 1 # Series is 1-D, so only axis = 0 is valid. + msg = "No axis named {axis}".format(axis=axis) + with tm.assertRaisesRegexp(ValueError, msg): + self.bseries.cumsum(axis=axis) + def test_numpy_cumsum(self): result = np.cumsum(self.bseries) expected = SparseSeries(self.bseries.to_dense().cumsum()) tm.assert_sp_series_equal(result, expected) - # TODO: gh-12855 - return a SparseSeries here result = np.cumsum(self.zbseries) expected = self.zbseries.to_dense().cumsum() - self.assertNotIsInstance(result, SparseSeries) tm.assert_series_equal(result, expected) msg = "the 'dtype' parameter is not supported"