Skip to content

Commit

Permalink
REF: implement cumulative ops block-wise (#29872)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored and jreback committed Dec 30, 2019
1 parent a6a8440 commit db062da
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 37 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -765,6 +765,7 @@ Numeric
- Bug in :class:`NumericIndex` construction that caused :class:`UInt64Index` to be casted to :class:`Float64Index` when integers in the ``np.uint64`` range were used to index a :class:`DataFrame` (:issue:`28279`)
- Bug in :meth:`Series.interpolate` when using method=`index` with an unsorted index, would previously return incorrect results. (:issue:`21037`)
- Bug in :meth:`DataFrame.round` where a :class:`DataFrame` with a :class:`CategoricalIndex` of :class:`IntervalIndex` columns would incorrectly raise a ``TypeError`` (:issue:`30063`)
- Bug in :class:`DataFrame` cumulative operations (e.g. cumsum, cummax) incorrect casting to object-dtype (:issue:`19296`)

Conversion
^^^^^^^^^^
Expand Down
92 changes: 57 additions & 35 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -11086,44 +11086,66 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs):
else:
axis = self._get_axis_number(axis)

y = com.values_from_object(self).copy()
d = self._construct_axes_dict()
d["copy"] = False
if axis == 1:
return cum_func(self.T, axis=0, skipna=skipna, *args, **kwargs).T

def na_accum_func(blk_values):
# We will be applying this function to block values
if blk_values.dtype.kind in ["m", "M"]:
# numpy 1.18 started sorting NaTs at the end instead of beginning,
# so we need to work around to maintain backwards-consistency.
orig_dtype = blk_values.dtype

# We need to define mask before masking NaTs
mask = isna(blk_values)

if accum_func == np.minimum.accumulate:
# Note: the accum_func comparison fails as an "is" comparison
y = blk_values.view("i8")
y[mask] = np.iinfo(np.int64).max
changed = True
else:
y = blk_values
changed = False

result = accum_func(y.view("i8"), axis)
if skipna:
np.putmask(result, mask, iNaT)
elif accum_func == np.minimum.accumulate:
# Restore NaTs that we masked previously
nz = (~np.asarray(mask)).nonzero()[0]
if len(nz):
# everything up to the first non-na entry stays NaT
result[: nz[0]] = iNaT

if changed:
# restore NaT elements
y[mask] = iNaT # TODO: could try/finally for this?

if isinstance(blk_values, np.ndarray):
result = result.view(orig_dtype)
else:
# DatetimeArray
result = type(blk_values)._from_sequence(result, dtype=orig_dtype)

elif skipna and not issubclass(
blk_values.dtype.type, (np.integer, np.bool_)
):
vals = blk_values.copy().T
mask = isna(vals)
np.putmask(vals, mask, mask_a)
result = accum_func(vals, axis)
np.putmask(result, mask, mask_b)
else:
result = accum_func(blk_values.T, axis)

if issubclass(y.dtype.type, (np.datetime64, np.timedelta64)):
# numpy 1.18 started sorting NaTs at the end instead of beginning,
# so we need to work around to maintain backwards-consistency.
orig_dtype = y.dtype
if accum_func == np.minimum.accumulate:
# Note: the accum_func comparison fails as an "is" comparison
# Note that "y" is always a copy, so we can safely modify it
mask = isna(self)
y = y.view("i8")
y[mask] = np.iinfo(np.int64).max

result = accum_func(y.view("i8"), axis).view(orig_dtype)
if skipna:
mask = isna(self)
np.putmask(result, mask, iNaT)
elif accum_func == np.minimum.accumulate:
# Restore NaTs that we masked previously
nz = (~np.asarray(mask)).nonzero()[0]
if len(nz):
# everything up to the first non-na entry stays NaT
result[: nz[0]] = iNaT
# transpose back for ndarray, not for EA
return result.T if hasattr(result, "T") else result

if self.ndim == 1:
# restore dt64tz dtype
d["dtype"] = self.dtype

elif skipna and not issubclass(y.dtype.type, (np.integer, np.bool_)):
mask = isna(self)
np.putmask(y, mask, mask_a)
result = accum_func(y, axis)
np.putmask(result, mask, mask_b)
else:
result = accum_func(y, axis)
result = self._data.apply(na_accum_func)

d = self._construct_axes_dict()
d["copy"] = False
return self._constructor(result, **d).__finalize__(self)

return set_function_name(cum_func, name, cls)
Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/frame/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -1331,8 +1331,8 @@ def test_agg_cython_table(self, df, func, expected, axis):
_get_cython_table_params(
DataFrame([[np.nan, 1], [1, 2]]),
[
("cumprod", DataFrame([[np.nan, 1], [1.0, 2.0]])),
("cumsum", DataFrame([[np.nan, 1], [1.0, 3.0]])),
("cumprod", DataFrame([[np.nan, 1], [1, 2]])),
("cumsum", DataFrame([[np.nan, 1], [1, 3]])),
],
),
),
Expand All @@ -1341,6 +1341,10 @@ def test_agg_cython_table_transform(self, df, func, expected, axis):
# GH 21224
# test transforming functions in
# pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
if axis == "columns" or axis == 1:
# operating blockwise doesn't let us preserve dtypes
expected = expected.astype("float64")

result = df.agg(func, axis=axis)
tm.assert_frame_equal(result, expected)

Expand Down
15 changes: 15 additions & 0 deletions pandas/tests/frame/test_cumulative.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,18 @@ def test_cummax(self, datetime_frame):
# fix issue
cummax_xs = datetime_frame.cummax(axis=1)
assert np.shape(cummax_xs) == np.shape(datetime_frame)

def test_cumulative_ops_preserve_dtypes(self):
# GH#19296 dont incorrectly upcast to object
df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3.0], "C": [True, False, False]})

result = df.cumsum()

expected = DataFrame(
{
"A": Series([1, 3, 6], dtype=np.int64),
"B": Series([1, 3, 6], dtype=np.float64),
"C": df["C"].cumsum(),
}
)
tm.assert_frame_equal(result, expected)

0 comments on commit db062da

Please sign in to comment.