Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REF: implement cumulative ops block-wise #29872

Merged
merged 10 commits into from
Dec 30, 2019
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -763,6 +763,7 @@ Numeric
- Bug in :class:`NumericIndex` construction that caused :class:`UInt64Index` to be casted to :class:`Float64Index` when integers in the ``np.uint64`` range were used to index a :class:`DataFrame` (:issue:`28279`)
- Bug in :meth:`Series.interpolate` when using method=`index` with an unsorted index, would previously return incorrect results. (:issue:`21037`)
- Bug in :meth:`DataFrame.round` where a :class:`DataFrame` with a :class:`CategoricalIndex` of :class:`IntervalIndex` columns would incorrectly raise a ``TypeError`` (:issue:`30063`)
- Bug in :class:`DataFrame` cumulative operations (e.g. cumsum, cummax) incorrect casting to object-dtype (:issue:`19296`)

Conversion
^^^^^^^^^^
Expand Down
92 changes: 57 additions & 35 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -11086,44 +11086,66 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs):
else:
axis = self._get_axis_number(axis)

y = com.values_from_object(self).copy()
d = self._construct_axes_dict()
d["copy"] = False
if axis == 1:
return cum_func(self.T, axis=0, skipna=skipna, *args, **kwargs).T
jreback marked this conversation as resolved.
Show resolved Hide resolved

def na_accum_func(blk_values):
jreback marked this conversation as resolved.
Show resolved Hide resolved
# We will be applying this function to block values
if blk_values.dtype.kind in ["m", "M"]:
# numpy 1.18 started sorting NaTs at the end instead of beginning,
# so we need to work around to maintain backwards-consistency.
orig_dtype = blk_values.dtype

# We need to define mask before masking NaTs
mask = isna(blk_values)

if accum_func == np.minimum.accumulate:
# Note: the accum_func comparison fails as an "is" comparison
y = blk_values.view("i8")
y[mask] = np.iinfo(np.int64).max
changed = True
else:
y = blk_values
changed = False

result = accum_func(y.view("i8"), axis)
if skipna:
np.putmask(result, mask, iNaT)
elif accum_func == np.minimum.accumulate:
# Restore NaTs that we masked previously
nz = (~np.asarray(mask)).nonzero()[0]
if len(nz):
# everything up to the first non-na entry stays NaT
result[: nz[0]] = iNaT

if changed:
# restore NaT elements
y[mask] = iNaT # TODO: could try/finally for this?

if isinstance(blk_values, np.ndarray):
result = result.view(orig_dtype)
else:
# DatetimeArray
result = type(blk_values)._from_sequence(result, dtype=orig_dtype)

elif skipna and not issubclass(
blk_values.dtype.type, (np.integer, np.bool_)
):
vals = blk_values.copy().T
mask = isna(vals)
np.putmask(vals, mask, mask_a)
result = accum_func(vals, axis)
np.putmask(result, mask, mask_b)
else:
result = accum_func(blk_values.T, axis)

if issubclass(y.dtype.type, (np.datetime64, np.timedelta64)):
# numpy 1.18 started sorting NaTs at the end instead of beginning,
# so we need to work around to maintain backwards-consistency.
orig_dtype = y.dtype
if accum_func == np.minimum.accumulate:
# Note: the accum_func comparison fails as an "is" comparison
# Note that "y" is always a copy, so we can safely modify it
mask = isna(self)
y = y.view("i8")
y[mask] = np.iinfo(np.int64).max

result = accum_func(y.view("i8"), axis).view(orig_dtype)
if skipna:
mask = isna(self)
np.putmask(result, mask, iNaT)
elif accum_func == np.minimum.accumulate:
# Restore NaTs that we masked previously
nz = (~np.asarray(mask)).nonzero()[0]
if len(nz):
# everything up to the first non-na entry stays NaT
result[: nz[0]] = iNaT
# transpose back for ndarray, not for EA
return result.T if hasattr(result, "T") else result

if self.ndim == 1:
# restore dt64tz dtype
d["dtype"] = self.dtype

elif skipna and not issubclass(y.dtype.type, (np.integer, np.bool_)):
mask = isna(self)
np.putmask(y, mask, mask_a)
result = accum_func(y, axis)
np.putmask(result, mask, mask_b)
else:
result = accum_func(y, axis)
result = self._data.apply(na_accum_func)

d = self._construct_axes_dict()
d["copy"] = False
return self._constructor(result, **d).__finalize__(self)

return set_function_name(cum_func, name, cls)
Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/frame/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -1331,8 +1331,8 @@ def test_agg_cython_table(self, df, func, expected, axis):
_get_cython_table_params(
DataFrame([[np.nan, 1], [1, 2]]),
[
("cumprod", DataFrame([[np.nan, 1], [1.0, 2.0]])),
("cumsum", DataFrame([[np.nan, 1], [1.0, 3.0]])),
("cumprod", DataFrame([[np.nan, 1], [1, 2]])),
("cumsum", DataFrame([[np.nan, 1], [1, 3]])),
],
),
),
Expand All @@ -1341,6 +1341,10 @@ def test_agg_cython_table_transform(self, df, func, expected, axis):
# GH 21224
# test transforming functions in
# pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
if axis == "columns" or axis == 1:
# operating blockwise doesn't let us preserve dtypes
expected = expected.astype("float64")

result = df.agg(func, axis=axis)
tm.assert_frame_equal(result, expected)

Expand Down
15 changes: 15 additions & 0 deletions pandas/tests/frame/test_cumulative.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,18 @@ def test_cummax(self, datetime_frame):
# fix issue
cummax_xs = datetime_frame.cummax(axis=1)
assert np.shape(cummax_xs) == np.shape(datetime_frame)

def test_cumulative_ops_preserve_dtypes(self):
# GH#19296 dont incorrectly upcast to object
df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3.0], "C": [True, False, False]})

result = df.cumsum()

expected = DataFrame(
{
"A": Series([1, 3, 6], dtype=np.int64),
"B": Series([1, 3, 6], dtype=np.float64),
"C": df["C"].cumsum(),
}
)
tm.assert_frame_equal(result, expected)