Skip to content

Commit

Permalink
PERF: avoid creating many Series in apply_standard (#34909)
Browse files (browse the repository at this point in the history)
  • Loading branch information
jbrockmendel authored Jun 25, 2020
1 parent 370ddc6 commit 91802a9
Showing 1 changed file with 48 additions and 65 deletions.
113 changes: 48 additions & 65 deletions pandas/core/apply.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,16 +4,13 @@

import numpy as np

from pandas._config import option_context

from pandas._libs import reduction as libreduction
from pandas._typing import Axis
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.common import (
is_dict_like,
is_extension_array_dtype,
is_list_like,
is_sequence,
)
from pandas.core.dtypes.common import is_dict_like, is_list_like, is_sequence
from pandas.core.dtypes.generic import ABCSeries

from pandas.core.construction import create_series_with_explicit_dtype
Expand Down Expand Up @@ -260,53 +257,6 @@ def apply_standard(self):
# partial result that may be returned from reduction
partial_result = None

# try to reduce first (by default)
# this only matters if the reduction in values is of different dtype
# e.g. if we want to apply to a SparseFrame, then can't directly reduce

# we cannot reduce using non-numpy dtypes,
# as demonstrated in gh-12244
if (
self.result_type in ["reduce", None]
and not self.dtypes.apply(is_extension_array_dtype).any()
# Disallow dtypes where setting _index_data will break
# ExtensionArray values, see GH#31182
and not self.dtypes.apply(lambda x: x.kind in ["m", "M"]).any()
# Disallow complex_internals since libreduction shortcut raises a TypeError
and not self.agg_axis._has_complex_internals
):

values = self.values
index = self.obj._get_axis(self.axis)
labels = self.agg_axis
empty_arr = np.empty(len(index), dtype=values.dtype)

# Preserve subclass for e.g. test_subclassed_apply
dummy = self.obj._constructor_sliced(
empty_arr, index=index, dtype=values.dtype
)

try:
result, reduction_success = libreduction.compute_reduction(
values, self.f, axis=self.axis, dummy=dummy, labels=labels
)
except TypeError:
# e.g. test_apply_ignore_failures we just ignore
if not self.ignore_failures:
raise
except ZeroDivisionError:
# reached via numexpr; fall back to python implementation
pass
else:
if reduction_success:
return self.obj._constructor_sliced(result, index=labels)

# no exceptions - however reduction was unsuccessful,
# use the computed function result for first element
partial_result = result[0]
if isinstance(partial_result, ABCSeries):
partial_result = partial_result.infer_objects()

# compute the result using the series generator,
# use the result computed while trying to reduce if available.
results, res_index = self.apply_series_generator(partial_result)
Expand Down Expand Up @@ -344,7 +294,14 @@ def apply_series_generator(self, partial_result=None) -> Tuple[ResType, "Index"]
else:
for i, v in series_gen_enumeration:

results[i] = self.f(v)
with option_context("mode.chained_assignment", None):
# ignore SettingWithCopy here in case the user mutates
results[i] = self.f(v)

if isinstance(results[i], ABCSeries):
# If we have a view on v, we need to make a copy because
# series_generator will swap out the underlying data
results[i] = results[i].copy(deep=False)

return results, res_index

Expand All @@ -355,7 +312,6 @@ def wrap_results(

# see if we can infer the results
if len(results) > 0 and 0 in results and is_sequence(results[0]):

return self.wrap_results_for_axis(results, res_index)

# dict of scalars
Expand Down Expand Up @@ -395,9 +351,30 @@ def result_columns(self) -> "Index":

def wrap_results_for_axis(
self, results: ResType, res_index: "Index"
) -> "DataFrame":
) -> Union["Series", "DataFrame"]:
""" return the results for the rows """
result = self.obj._constructor(data=results)

if self.result_type == "reduce":
# e.g. test_apply_dict GH#8735
return self.obj._constructor_sliced(results)
elif self.result_type is None and all(
isinstance(x, dict) for x in results.values()
):
# Our operation was a to_dict op e.g.
# test_apply_dict GH#8735, test_apply_reduce_rows_to_dict GH#25196
return self.obj._constructor_sliced(results)

try:
result = self.obj._constructor(data=results)
except ValueError as err:
if "arrays must all be same length" in str(err):
# e.g. result = [[2, 3], [1.5], ['foo', 'bar']]
# see test_agg_listlike_result GH#29587
res = self.obj._constructor_sliced(results)
res.index = res_index
return res
else:
raise

if not isinstance(results[0], ABCSeries):
if len(result.index) == len(self.res_columns):
Expand All @@ -418,11 +395,19 @@ def apply_broadcast(self, target: "DataFrame") -> "DataFrame":

@property
def series_generator(self):
constructor = self.obj._constructor_sliced
return (
constructor(arr, index=self.columns, name=name)
for i, (arr, name) in enumerate(zip(self.values, self.index))
)
values = self.values
assert len(values) > 0

# We create one Series object, and will swap out the data inside
# of it. Kids: don't do this at home.
ser = self.obj._ixs(0, axis=0)
mgr = ser._mgr
blk = mgr.blocks[0]

for (arr, name) in zip(values, self.index):
blk.values = arr
ser.name = name
yield ser

@property
def result_index(self) -> "Index":
Expand All @@ -444,9 +429,7 @@ def wrap_results_for_axis(

# we have a non-series and don't want inference
elif not isinstance(results[0], ABCSeries):
from pandas import Series

result = Series(results)
result = self.obj._constructor_sliced(results)
result.index = res_index

# we may want to infer results
Expand Down

0 comments on commit 91802a9

Please sign in to comment.