Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH/PERF: enable column-wise reductions for EA-backed columns #32867

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
a9ca0fa
ENH/PERF: enable column-wise reductions for EA-backed columns
jorisvandenbossche Mar 20, 2020
21aee0d
fix numeric_only for EAs
jorisvandenbossche Mar 20, 2020
9f83f6e
fix _reduce_columns call
jorisvandenbossche Mar 20, 2020
a9706e0
move EA._reduce call into blk_func
jorisvandenbossche Mar 20, 2020
07372e3
reuse blk_func for column-wise, inline _iter_arrays
jorisvandenbossche Mar 20, 2020
2d08450
temp
jorisvandenbossche Mar 20, 2020
c7f1dae
Merge remote-tracking branch 'upstream/master' into EA-column-wise-re…
jorisvandenbossche Mar 27, 2020
9e2a780
first attempts of going block-wise with numeric_only=None
jorisvandenbossche Mar 27, 2020
7f847e8
Merge remote-tracking branch 'upstream/master' into EA-column-wise-re…
jorisvandenbossche Apr 3, 2020
594d2b0
TEMP
jorisvandenbossche Apr 3, 2020
3d62e68
Merge remote-tracking branch 'upstream/master' into EA-column-wise-re…
jorisvandenbossche Apr 24, 2020
5b0370e
use iter_column_arrays
jorisvandenbossche May 29, 2020
6e3c287
Merge remote-tracking branch 'upstream/master' into EA-column-wise-re…
jorisvandenbossche May 29, 2020
7088cfc
intermediate clean-up: remove BM.reduce changes + do column-wise for …
jorisvandenbossche May 29, 2020
925d660
fixup
jorisvandenbossche May 29, 2020
852331e
fix dtype of empty result
jorisvandenbossche May 30, 2020
1c9a685
Merge remote-tracking branch 'upstream/master' into EA-column-wise-re…
jorisvandenbossche Jun 6, 2020
34731f2
clean-up
jorisvandenbossche Jun 6, 2020
a8e61d0
whitespace
jorisvandenbossche Jun 6, 2020
f233109
Merge remote-tracking branch 'upstream/master' into EA-column-wise-re…
jorisvandenbossche Jun 15, 2020
ec65c57
Merge remote-tracking branch 'upstream/master' into EA-column-wise-re…
jorisvandenbossche Jul 12, 2020
15ec9b6
add test case for GH34520, copied from GH35112
jorisvandenbossche Jul 12, 2020
2653d02
add test to ensure EA op is used for integer array
jorisvandenbossche Jul 12, 2020
64e0069
remove try except
jorisvandenbossche Jul 12, 2020
bb0a47b
remove unused code
jorisvandenbossche Jul 12, 2020
9323f0e
add test for GH32651, copied from GH34210
jorisvandenbossche Jul 12, 2020
eb33f86
remove check for EAs for block-wise path
jorisvandenbossche Jul 12, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1149,6 +1149,7 @@ ExtensionArray
- Fixed bug that `DataFrame(columns=.., dtype='string')` would fail (:issue:`27953`, :issue:`33623`)
- Bug where :class:`DataFrame` column set to scalar extension type was considered an object type rather than the extension type (:issue:`34832`)
- Fixed bug in ``IntegerArray.astype`` to correctly copy the mask as well (:issue:`34931`).
- Fixed bug where :class:`DataFrame` reductions with ``Int64`` columns would cast the result to ``float64`` (:issue:`32651`)

Other
^^^^^
Expand Down
73 changes: 52 additions & 21 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -8501,6 +8501,22 @@ def _count_level(self, level, axis=0, numeric_only=False):
def _reduce(
self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds
):
"""
Reduce DataFrame over axis with given operation.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typing the args would be a plus.

Parameters
----------
op : func
The reducing function to be called on the values.
name : str
The name of the reduction.
axis : int
numeric_only : bool, optional
filter_type : None or "bool"
Set to "bool" for ops that only work on boolean values.
skipna, **kwds : keywords to pass to the `op` function

"""

assert filter_type is None or filter_type == "bool", filter_type

Expand Down Expand Up @@ -8551,6 +8567,7 @@ def _get_data(axis_matters):
raise NotImplementedError(msg)
return data

# special case for block-wise
if numeric_only is not None and axis in [0, 1]:
df = self
if numeric_only is True:
Expand Down Expand Up @@ -8579,33 +8596,47 @@ def blk_func(values):
out[:] = coerce_to_dtypes(out.values, df.dtypes)
return out

if not self._is_homogeneous_type:
# try to avoid self.values call
def array_func(values):
if isinstance(values, ExtensionArray):
return values._reduce(name, skipna=skipna, **kwds)
else:
return op(values, skipna=skipna, **kwds)
jbrockmendel marked this conversation as resolved.
Show resolved Hide resolved

if filter_type is None and axis == 0 and len(self) > 0:
# operate column-wise
# all other options with axis=0 are done column-array-wise
if axis == 0:

# numeric_only must be None here, as other cases caught above
# require len(self) > 0 bc frame_apply messes up empty prod/sum
def _constructor(df, result, index=None):
index = index if index is not None else df.columns
if len(result):
return df._constructor_sliced(result, index=index)
else:
# set correct dtype for empty result
dtype = "bool" if filter_type == "bool" else "float64"
return df._constructor_sliced(result, index=index, dtype=dtype)

# this can end up with a non-reduction
# but not always. if the types are mixed
# with datelike then need to make sure a series
df = self
if numeric_only is True:
df = _get_data(axis_matters=True)

# we only end up here if we have not specified
# numeric_only and yet we have tried a
# column-by-column reduction, where we have mixed type.
# So let's just do what we can
from pandas.core.apply import frame_apply
if numeric_only is not None:
result = [array_func(arr) for arr in df._iter_column_arrays()]
return _constructor(df, result)
else:
# with numeric_only=None, need to ignore exceptions per column
result = []
indices = []
for i, arr in enumerate(df._iter_column_arrays()):
try:
res = array_func(arr)
except Exception:
pass
else:
result.append(res)
indices.append(i)

opa = frame_apply(
self, func=f, result_type="expand", ignore_failures=True
)
result = opa.get_result()
if result.ndim == self.ndim:
result = result.iloc[0].rename(None)
return result
return _constructor(df, result, index=df.columns[indices])

# remaining cases for axis=1 or axis=None
if numeric_only is None:
data = self
values = data.values
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -11402,7 +11402,7 @@ def stat_func(
if level is not None:
return self._agg_by_level(name, axis=axis, level=level, skipna=skipna)
return self._reduce(
func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only,
)

return set_function_name(stat_func, name, cls)
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/arrays/integer/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,15 @@ def test_integer_array_numpy_sum(values, expected):
assert result == expected


@pytest.mark.parametrize("op", ["sum", "prod", "min", "max"])
def test_dataframe_reductions(op):
    # https://github.com/pandas-dev/pandas/pull/32867
    # ensure the integers are not cast to float during reductions
    df = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")})
    # Dispatch on the parametrized reduction name; calling ``df.max()``
    # unconditionally would leave "sum", "prod" and "min" untested.
    result = getattr(df, op)()
    assert isinstance(result["a"], np.int64)


# TODO(jreback) - these need testing / are broken

# shift
Expand Down
25 changes: 24 additions & 1 deletion pandas/tests/frame/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,7 @@ def test_stat_operators_attempt_obj_array(self, method):
for df in [df1, df2]:
assert df.values.dtype == np.object_
result = getattr(df, method)(1)
expected = getattr(df.astype("f8"), method)(1)
expected = getattr(df, method)(1)

if method in ["sum", "prod"]:
tm.assert_series_equal(result, expected)
Expand Down Expand Up @@ -1303,3 +1303,26 @@ def test_preserve_timezone(self, initial: str, method):
df = DataFrame([expected])
result = getattr(df, method)(axis=1)
tm.assert_series_equal(result, expected)


def test_mixed_frame_with_integer_sum():
    # https://github.com/pandas-dev/pandas/issues/34520
    # One-row frame: column "a" holds a string, column "b" is cast to the
    # nullable "Int64" dtype before summing.
    frame = pd.DataFrame([["a", 1]], columns=list("ab")).astype({"b": "Int64"})
    expected = pd.Series(["a", 1], index=["a", "b"])
    tm.assert_series_equal(frame.sum(), expected)


@pytest.mark.parametrize("numeric_only", [True, False, None])
@pytest.mark.parametrize("method", ["min", "max"])
def test_minmax_extensionarray(method, numeric_only):
    # https://github.com/pandas-dev/pandas/issues/32651
    # The column holds the int64 extremes around a missing value; the expected
    # result is the exact int64 limit that matches the reduction name.
    limits = np.iinfo("int64")
    values = Series([limits.max, None, limits.min], dtype=pd.Int64Dtype())
    frame = DataFrame({"Int64": values})

    reduction = getattr(frame, method)
    result = reduction(numeric_only=numeric_only)

    extreme = getattr(limits, method)
    expected = Series([extreme], index=pd.Index(["Int64"], dtype="object"))
    tm.assert_series_equal(result, expected)