ENH/PERF: enable column-wise reductions for EA-backed columns #32867

Closed
Changes from 3 commits
Commits
27 commits
a9ca0fa
ENH/PERF: enable column-wise reductions for EA-backed columns
jorisvandenbossche Mar 20, 2020
21aee0d
fix numeric_only for EAs
jorisvandenbossche Mar 20, 2020
9f83f6e
fix _reduce_columns call
jorisvandenbossche Mar 20, 2020
a9706e0
move EA._reduce call into blk_func
jorisvandenbossche Mar 20, 2020
07372e3
reuse blk_func for column-wise, inline _iter_arrays
jorisvandenbossche Mar 20, 2020
2d08450
temp
jorisvandenbossche Mar 20, 2020
c7f1dae
Merge remote-tracking branch 'upstream/master' into EA-column-wise-re…
jorisvandenbossche Mar 27, 2020
9e2a780
first attempts of going block-wise with numeric_only=None
jorisvandenbossche Mar 27, 2020
7f847e8
Merge remote-tracking branch 'upstream/master' into EA-column-wise-re…
jorisvandenbossche Apr 3, 2020
594d2b0
TEMP
jorisvandenbossche Apr 3, 2020
3d62e68
Merge remote-tracking branch 'upstream/master' into EA-column-wise-re…
jorisvandenbossche Apr 24, 2020
5b0370e
use iter_column_arrays
jorisvandenbossche May 29, 2020
6e3c287
Merge remote-tracking branch 'upstream/master' into EA-column-wise-re…
jorisvandenbossche May 29, 2020
7088cfc
intermediate clean-up: remove BM.reduce changes + do column-wise for …
jorisvandenbossche May 29, 2020
925d660
fixup
jorisvandenbossche May 29, 2020
852331e
fix dtype of empty result
jorisvandenbossche May 30, 2020
1c9a685
Merge remote-tracking branch 'upstream/master' into EA-column-wise-re…
jorisvandenbossche Jun 6, 2020
34731f2
clean-up
jorisvandenbossche Jun 6, 2020
a8e61d0
whitespace
jorisvandenbossche Jun 6, 2020
f233109
Merge remote-tracking branch 'upstream/master' into EA-column-wise-re…
jorisvandenbossche Jun 15, 2020
ec65c57
Merge remote-tracking branch 'upstream/master' into EA-column-wise-re…
jorisvandenbossche Jul 12, 2020
15ec9b6
add test case for GH34520, copied from GH35112
jorisvandenbossche Jul 12, 2020
2653d02
add test to ensure EA op is used for integer array
jorisvandenbossche Jul 12, 2020
64e0069
remove try except
jorisvandenbossche Jul 12, 2020
bb0a47b
remove unused code
jorisvandenbossche Jul 12, 2020
9323f0e
add test for GH32651, copied from GH34210
jorisvandenbossche Jul 12, 2020
eb33f86
remove check for EAs for block-wise path
jorisvandenbossche Jul 12, 2020
67 changes: 66 additions & 1 deletion pandas/core/frame.py
@@ -7852,6 +7852,23 @@ def _count_level(self, level, axis=0, numeric_only=False):
def _reduce(
self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds
):
"""
Reduce DataFrame over axis with given operation.

[Inline review comment — Contributor: typing args a +]

Parameters
----------
op : func
The reducing function to be called on the values.
name : str
The name of the reduction.
axis : int
numeric_only : bool, optional
filter_type : None or "bool"
Set to "bool" for ops that give boolean results.
skipna, **kwds : keywords to pass to the `op` function

"""
column_wise = kwds.pop("column_wise", False)

assert filter_type is None or filter_type == "bool", filter_type

@@ -7898,6 +7915,13 @@ def _get_data(axis_matters):
raise NotImplementedError(msg)
return data

if axis == 0 and column_wise:
# column-wise reduction
df = self
if numeric_only is True:
df = _get_data(axis_matters=True)
return df._reduce_columns(op, name, skipna=skipna, **kwds)

if numeric_only is not None and axis in [0, 1]:
df = self
if numeric_only is True:
@@ -7916,7 +7940,7 @@ def blk_func(values):

# After possibly _get_data and transposing, we are now in the
# simple case where we can use BlockManager._reduce
res = df._data.reduce(blk_func)
res = df._data.reduce(blk_func, name, skipna, **kwds)
assert isinstance(res, dict)
if len(res):
assert len(res) == max(list(res.keys())) + 1, res.keys()
@@ -7994,6 +8018,47 @@ def blk_func(values):
result = self._constructor_sliced(result, index=labels)
return result

def _reduce_columns(self, op, name, skipna=True, **kwds):
"""
Reduce DataFrame column-wise.

Parameters
----------
op : func
The reducing function to be called on the values. Only used
for columns backed by a numpy ndarray.
name : str
The name of the reduction.
skipna, **kwds : keywords to pass to the `op` function

Returns
-------
Series
"""
result = []

for arr in self._iter_arrays():
if isinstance(arr, ExtensionArray):
# dispatch to ExtensionArray interface
val = arr._reduce(name, skipna=skipna, **kwds)
else:
# dispatch to numpy arrays
with np.errstate(all="ignore"):
val = op(arr, skipna=skipna, **kwds)

result.append(val)

return self._constructor_sliced(result, index=self.columns)

def _iter_arrays(self):
"""
Iterate over the arrays of all columns in order.

This returns the values as stored in the Block (ndarray or ExtensionArray).
"""
for i in range(len(self.columns)):
yield self._data.iget_values(i)

def nunique(self, axis=0, dropna=True) -> Series:
"""
Count distinct observations over requested axis.
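As a side note for readers of this diff: below is a minimal, self-contained sketch of the dispatch that the new `DataFrame._reduce_columns` performs, written against public-ish APIs rather than the internals. The helper names `nan_sum` and `reduce_columns_sketch` are illustrative only; the EA branch calls the same private `ExtensionArray._reduce` hook the PR relies on.

```python
import numpy as np
import pandas as pd
from pandas.api.types import is_extension_array_dtype


def nan_sum(values, skipna=True, **kwds):
    # Stand-in for the nanops reducer that ``op`` would be inside pandas.
    return np.nansum(values) if skipna else np.sum(values)


def reduce_columns_sketch(df, op, name, skipna=True, **kwds):
    # Mirrors DataFrame._reduce_columns: one reduction per column,
    # dispatching on whether the column is EA-backed.
    result = []
    for col in df.columns:
        ser = df[col]
        if is_extension_array_dtype(ser.dtype):
            # Same private hook the PR dispatches to (e.g. IntegerArray._reduce).
            val = ser.array._reduce(name, skipna=skipna, **kwds)
        else:
            with np.errstate(all="ignore"):
                val = op(ser.to_numpy(), skipna=skipna, **kwds)
        result.append(val)
    return pd.Series(result, index=df.columns)


df = pd.DataFrame({"a": pd.array([1, 2, None], dtype="Int64"), "b": [1.0, 2.0, np.nan]})
print(reduce_columns_sketch(df, nan_sum, "sum"))  # per-column sums, Int64 column stays on its EA path
```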
10 changes: 9 additions & 1 deletion pandas/core/generic.py
@@ -11067,6 +11067,7 @@ def stat_func(
min_count=0,
**kwargs,
):
column_wise = kwargs.pop("column_wise", False)
if name == "sum":
nv.validate_sum(tuple(), kwargs)
elif name == "prod":
Expand All @@ -11088,6 +11089,7 @@ def stat_func(
skipna=skipna,
numeric_only=numeric_only,
min_count=min_count,
column_wise=column_wise,
)

return set_function_name(stat_func, name, cls)
@@ -11117,6 +11119,7 @@ def _make_stat_function(
def stat_func(
self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
):
column_wise = kwargs.pop("column_wise", False)
if name == "median":
nv.validate_median(tuple(), kwargs)
else:
@@ -11128,7 +11131,12 @@ def stat_func(
if level is not None:
return self._agg_by_level(name, axis=axis, level=level, skipna=skipna)
return self._reduce(
func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
func,
name=name,
axis=axis,
skipna=skipna,
numeric_only=numeric_only,
column_wise=column_wise,
)

return set_function_name(stat_func, name, cls)
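Since the `column_wise` keyword is threaded through the generated stat functions here, on this branch it could be exercised straight from the public reducers. A hypothetical usage sketch (the keyword is internal to this PR and not part of released pandas):

```python
import pandas as pd

df = pd.DataFrame(
    {
        "a": pd.array([1, 2, None], dtype="Int64"),  # EA-backed column
        "b": [0.5, 1.5, 2.5],                         # plain float64 column
    }
)

# On this branch: stat_func pops column_wise and passes it to DataFrame._reduce,
# which routes axis=0 reductions through DataFrame._reduce_columns, so the
# Int64 column is reduced via IntegerArray._reduce instead of being cast first.
res = df.sum(column_wise=True)
```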
19 changes: 15 additions & 4 deletions pandas/core/internals/managers.py
@@ -349,24 +349,27 @@ def _verify_integrity(self) -> None:
f"tot_items: {tot_items}"
)

def reduce(self, func, *args, **kwargs):
def reduce(self, func, name, skipna=True, **kwds):
# If 2D, we assume that we're operating column-wise
if self.ndim == 1:
# we'll be returning a scalar
blk = self.blocks[0]
return func(blk.values, *args, **kwargs)
return func(blk.values)

res = {}
for blk in self.blocks:
bres = func(blk.values, *args, **kwargs)
if isinstance(blk, ExtensionBlock):
bres = blk.values._reduce(name, skipna=skipna, **kwds)
else:
bres = func(blk.values)

if np.ndim(bres) == 0:
# EA
assert blk.shape[0] == 1
new_res = zip(blk.mgr_locs.as_array, [bres])
else:
assert bres.ndim == 1, bres.shape
assert blk.shape[0] == len(bres), (blk.shape, bres.shape, args, kwargs)
assert blk.shape[0] == len(bres), (blk.shape, bres.shape)
new_res = zip(blk.mgr_locs.as_array, bres)

nr = dict(new_res)
@@ -998,6 +1001,14 @@ def iget(self, i: int) -> "SingleBlockManager":
fastpath=True,
)

def iget_values(self, i: int):
"""
Return the data for column i as the values (ndarray or ExtensionArray).
"""
block = self.blocks[self.blknos[i]]
values = block.iget(self.blklocs[i])
return values

def delete(self, item):
"""
Delete selected item (items if non-unique) in-place.
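For readers unfamiliar with the block layout: `BlockManager.reduce` produces one result per block row and maps those results back to DataFrame column positions via `mgr_locs`, which is what the `res` dict and the `max(res.keys()) + 1` assertion in `frame.py` rely on. A toy illustration of that bookkeeping with plain numpy arrays and dicts (no pandas internals):

```python
import numpy as np

# Toy stand-in for the blocks of a 2D BlockManager: each "block" holds a 2D
# array (one row per column it stores) plus the positions (mgr_locs) those
# columns occupy in the DataFrame.
blocks = [
    {"values": np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), "mgr_locs": np.array([0, 2])},
    {"values": np.array([[10, 20, 30]]), "mgr_locs": np.array([1])},
]

res = {}
for blk in blocks:
    bres = blk["values"].sum(axis=1)           # one scalar per row of the block
    assert blk["values"].shape[0] == len(bres)
    res.update(zip(blk["mgr_locs"], bres))      # map results back to column positions

# Reassemble in column order, as DataFrame._reduce does after mgr.reduce.
ncols = max(res) + 1
assert len(res) == ncols
out = [res[i] for i in range(ncols)]
print(out)  # [6.0, 60, 15.0]
```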
1 change: 1 addition & 0 deletions pandas/core/series.py
@@ -3871,6 +3871,7 @@ def _reduce(
If we have an ndarray as a value, then simply perform the operation,
otherwise delegate to the object.
"""
kwds.pop("column_wise", None)
delegate = self._values

if axis is not None:
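Finally, the whole PR hinges on the existing `ExtensionArray._reduce` hook (a private API); a quick look at what the column-wise and block-wise paths delegate to, shown purely for illustration:

```python
import pandas as pd

arr = pd.array([1, 2, None], dtype="Int64")  # IntegerArray, an ExtensionArray

# The hook DataFrame._reduce_columns and BlockManager.reduce dispatch to on
# this branch; it honours the mask, so NA is skipped when skipna=True.
print(arr._reduce("sum", skipna=True))   # 3 (sum of the non-missing values)
print(arr._reduce("mean", skipna=True))  # 1.5
```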