diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index b403508029f1f..7b94ee20eabb7 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -763,6 +763,7 @@ Numeric - Bug in :class:`NumericIndex` construction that caused :class:`UInt64Index` to be casted to :class:`Float64Index` when integers in the ``np.uint64`` range were used to index a :class:`DataFrame` (:issue:`28279`) - Bug in :meth:`Series.interpolate` when using method=`index` with an unsorted index, would previously return incorrect results. (:issue:`21037`) - Bug in :meth:`DataFrame.round` where a :class:`DataFrame` with a :class:`CategoricalIndex` of :class:`IntervalIndex` columns would incorrectly raise a ``TypeError`` (:issue:`30063`) +- Bug in :class:`DataFrame` cumulative operations (e.g. cumsum, cummax) incorrectly casting to object-dtype (:issue:`19296`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c4461a9530e5c..ea05bb4d9345c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11086,44 +11086,66 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): else: axis = self._get_axis_number(axis) - y = com.values_from_object(self).copy() - d = self._construct_axes_dict() - d["copy"] = False + if axis == 1: + return cum_func(self.T, axis=0, skipna=skipna, *args, **kwargs).T + + def na_accum_func(blk_values): + # We will be applying this function to block values + if blk_values.dtype.kind in ["m", "M"]: + # numpy 1.18 started sorting NaTs at the end instead of beginning, + # so we need to work around to maintain backwards-consistency. 
+ orig_dtype = blk_values.dtype + + # We need to define mask before masking NaTs + mask = isna(blk_values) + + if accum_func == np.minimum.accumulate: + # Note: the accum_func comparison fails as an "is" comparison + y = blk_values.view("i8") + y[mask] = np.iinfo(np.int64).max + changed = True + else: + y = blk_values + changed = False + + result = accum_func(y.view("i8"), axis) + if skipna: + np.putmask(result, mask, iNaT) + elif accum_func == np.minimum.accumulate: + # Restore NaTs that we masked previously + nz = (~np.asarray(mask)).nonzero()[0] + if len(nz): + # everything up to the first non-na entry stays NaT + result[: nz[0]] = iNaT + + if changed: + # restore NaT elements + y[mask] = iNaT # TODO: could try/finally for this? + + if isinstance(blk_values, np.ndarray): + result = result.view(orig_dtype) + else: + # DatetimeArray + result = type(blk_values)._from_sequence(result, dtype=orig_dtype) + + elif skipna and not issubclass( + blk_values.dtype.type, (np.integer, np.bool_) + ): + vals = blk_values.copy().T + mask = isna(vals) + np.putmask(vals, mask, mask_a) + result = accum_func(vals, axis) + np.putmask(result, mask, mask_b) + else: + result = accum_func(blk_values.T, axis) - if issubclass(y.dtype.type, (np.datetime64, np.timedelta64)): - # numpy 1.18 started sorting NaTs at the end instead of beginning, - # so we need to work around to maintain backwards-consistency. 
- orig_dtype = y.dtype - if accum_func == np.minimum.accumulate: - # Note: the accum_func comparison fails as an "is" comparison - # Note that "y" is always a copy, so we can safely modify it - mask = isna(self) - y = y.view("i8") - y[mask] = np.iinfo(np.int64).max - - result = accum_func(y.view("i8"), axis).view(orig_dtype) - if skipna: - mask = isna(self) - np.putmask(result, mask, iNaT) - elif accum_func == np.minimum.accumulate: - # Restore NaTs that we masked previously - nz = (~np.asarray(mask)).nonzero()[0] - if len(nz): - # everything up to the first non-na entry stays NaT - result[: nz[0]] = iNaT + # transpose back for ndarray, not for EA + return result.T if hasattr(result, "T") else result - if self.ndim == 1: - # restore dt64tz dtype - d["dtype"] = self.dtype - - elif skipna and not issubclass(y.dtype.type, (np.integer, np.bool_)): - mask = isna(self) - np.putmask(y, mask, mask_a) - result = accum_func(y, axis) - np.putmask(result, mask, mask_b) - else: - result = accum_func(y, axis) + result = self._data.apply(na_accum_func) + d = self._construct_axes_dict() + d["copy"] = False return self._constructor(result, **d).__finalize__(self) return set_function_name(cum_func, name, cls) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index eb98bdc49f976..93e165ad3d71e 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -1331,8 +1331,8 @@ def test_agg_cython_table(self, df, func, expected, axis): _get_cython_table_params( DataFrame([[np.nan, 1], [1, 2]]), [ - ("cumprod", DataFrame([[np.nan, 1], [1.0, 2.0]])), - ("cumsum", DataFrame([[np.nan, 1], [1.0, 3.0]])), + ("cumprod", DataFrame([[np.nan, 1], [1, 2]])), + ("cumsum", DataFrame([[np.nan, 1], [1, 3]])), ], ), ), @@ -1341,6 +1341,10 @@ def test_agg_cython_table_transform(self, df, func, expected, axis): # GH 21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) + if axis == "columns" or axis 
== 1: + # operating blockwise doesn't let us preserve dtypes + expected = expected.astype("float64") + result = df.agg(func, axis=axis) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_cumulative.py b/pandas/tests/frame/test_cumulative.py index ad2cbff888b2e..2deeeb95d057d 100644 --- a/pandas/tests/frame/test_cumulative.py +++ b/pandas/tests/frame/test_cumulative.py @@ -118,3 +118,18 @@ def test_cummax(self, datetime_frame): # fix issue cummax_xs = datetime_frame.cummax(axis=1) assert np.shape(cummax_xs) == np.shape(datetime_frame) + + def test_cumulative_ops_preserve_dtypes(self): + # GH#19296 don't incorrectly upcast to object + df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3.0], "C": [True, False, False]}) + + result = df.cumsum() + + expected = DataFrame( + { + "A": Series([1, 3, 6], dtype=np.int64), + "B": Series([1, 3, 6], dtype=np.float64), + "C": df["C"].cumsum(), + } + ) + tm.assert_frame_equal(result, expected)