Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF: masked ops for reductions (sum) #30982

Merged
merged 24 commits into from
Mar 27, 2020
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
9795f35
POC masked ops for reductions
jorisvandenbossche Jan 13, 2020
7fb1f88
Merge remote-tracking branch 'upstream/master' into masked-ops
jorisvandenbossche Feb 13, 2020
cd26920
fix mask for older numpy
jorisvandenbossche Feb 13, 2020
28cd331
also use in boolean
jorisvandenbossche Feb 13, 2020
6298fbd
add min_count support
jorisvandenbossche Feb 13, 2020
735a741
fix preserve_dtypes test
jorisvandenbossche Feb 13, 2020
6df454f
passthrough min_count for boolean as well
jorisvandenbossche Feb 13, 2020
5eb48d6
fix comment
jorisvandenbossche Feb 13, 2020
3ef1331
Merge remote-tracking branch 'upstream/master' into masked-ops
jorisvandenbossche Feb 20, 2020
d2230fd
add object to empty reduction test case
jorisvandenbossche Feb 20, 2020
19ac821
test platform int
jorisvandenbossche Feb 21, 2020
2776436
Test sum separately with platform int
jorisvandenbossche Feb 21, 2020
68b4dc2
Merge remote-tracking branch 'upstream/master' into masked-ops
jorisvandenbossche Mar 20, 2020
18d5bfa
share min_count checking helper function with nanops
jorisvandenbossche Mar 20, 2020
457efb1
Merge remote-tracking branch 'upstream/master' into masked-ops
jorisvandenbossche Mar 23, 2020
4df858f
type + add docstring for min_count
jorisvandenbossche Mar 23, 2020
f5120db
Merge remote-tracking branch 'upstream/master' into masked-ops
jorisvandenbossche Mar 25, 2020
76c5149
move sum algo from ops to array_algos
jorisvandenbossche Mar 25, 2020
476f768
Merge remote-tracking branch 'upstream/master' into masked-ops
jorisvandenbossche Mar 26, 2020
b2162dc
add Int64/boolean to some benchmarks
jorisvandenbossche Mar 26, 2020
d4746f5
add whatsnew
jorisvandenbossche Mar 26, 2020
d9c2cbf
add skipna default in function signature
jorisvandenbossche Mar 26, 2020
f8705c2
update type hint + deprivatize
jorisvandenbossche Mar 26, 2020
1a43e10
update another type hint
jorisvandenbossche Mar 26, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

from pandas.core import nanops, ops
from pandas.core.indexers import check_array_indexer
from pandas.core.ops import mask_ops

from .masked import BaseMaskedArray

Expand Down Expand Up @@ -697,6 +698,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
data = self._data
mask = self._mask

if name == "sum":
return mask_ops.sum(data, mask, skipna=skipna, **kwargs)

# coerce to a nan-aware float if needed
if self._hasna:
data = self.to_numpy("float64", na_value=np.nan)
Expand All @@ -708,7 +712,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
return libmissing.NA

# if we have numeric op that would result in an int, coerce to int if possible
if name in ["sum", "prod"] and notna(result):
if name == "prod" and notna(result):
jreback marked this conversation as resolved.
Show resolved Hide resolved
int_result = np.int64(result)
if int_result == result:
result = int_result
Expand Down
7 changes: 5 additions & 2 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from pandas.core import nanops, ops
import pandas.core.common as com
from pandas.core.indexers import check_array_indexer
from pandas.core.ops import invalid_comparison
from pandas.core.ops import invalid_comparison, mask_ops
from pandas.core.ops.common import unpack_zerodim_and_defer
from pandas.core.tools.numeric import to_numeric

Expand Down Expand Up @@ -567,6 +567,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
data = self._data
mask = self._mask

if name == "sum":
return mask_ops.sum(data, mask, skipna=skipna, **kwargs)

# coerce to a nan-aware float if needed
# (we explicitly use NaN within reductions)
if self._hasna:
Expand All @@ -584,7 +587,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):

# if we have a preservable numeric op,
# provide coercion back to an integer type if possible
elif name in ["sum", "min", "max", "prod"]:
elif name in ["min", "max", "prod"]:
# GH#31409 more performant than casting-then-checking
result = com.cast_scalar_indexer(result)

Expand Down
53 changes: 53 additions & 0 deletions pandas/core/ops/mask_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import numpy as np

from pandas._libs import lib, missing as libmissing
from pandas.compat.numpy import _np_version_under1p17


def kleene_or(
Expand Down Expand Up @@ -176,3 +177,55 @@ def kleene_and(
def raise_for_nan(value, method):
if lib.is_float(value) and np.isnan(value):
raise ValueError(f"Cannot perform logical '{method}' with floating NaN")


def sum(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This entire file is in a really weird place. We have pandas/core/nanops.py, much of which this duplicates, and the eventual home is pandas/core/array_algos/, where I think this should live (once merged with nanops).

I think this PR needs to move things to the right place, rather than having duplicated code. A precursor PR to move things around would be OK too.

Having duplicated code is a huge drag and needs to be patched sooner rather than later. I believe we had this discussion originally when this location was selected (pre 1.0.0)

values: np.ndarray, mask: np.ndarray, skipna: bool, min_count: int = 0,
):
"""
Sum for 1D masked array.

Parameters
----------
values : np.ndarray
Numpy array with the values (can be of any dtype that support the
operation).
mask : np.ndarray
Boolean numpy array (False for missing)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does "False for missing" mean here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, not sure, as it should actually be "True for missing" if it is meant to explain that the True values in the mask are missing values (maybe I was originally planning to pass an inverted mask).
Will update that.

skipna : bool, default True
Whether to skip NA.
min_count : int, default 0
The required number of valid values to perform the operation. If fewer than
``min_count`` non-NA values are present the result will be NA.
"""
if not skipna:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Most of the code here is not sum-specific. Is the idea to eventually make a template that gets re-used?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's possible. I mainly want to implement a single one first, so we can review this and agree on the principles. Afterwards there can be more PRs implementing the other reductions, and then we should indeed see what the best way is to do this while avoiding duplication.

if mask.any():
return libmissing.NA
else:
if _below_min_count(values, None, min_count):
return libmissing.NA
return np.sum(values)
else:
if _below_min_count(values, mask, min_count):
return libmissing.NA

if _np_version_under1p17:
return np.sum(values[~mask])
else:
return np.sum(values, where=~mask)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

out of curiosity, is this more efficient than the version used in older numpy?

Copy link
Member Author

@jorisvandenbossche jorisvandenbossche Feb 20, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question. With the array in my top post it gives only a slight boost (a 5-8% speed-up), but it varies quite a bit (I could also get it to come out slower with more np.nan values).
But when going to larger arrays, there seems to be a clearer difference:

In [40]: a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, np.nan]*1_000_000)   

In [41]: mask = np.isnan(a)  

In [42]: %timeit np.sum(a, where=~mask)    
17.7 ms ± 861 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [43]: %timeit np.sum(a[~mask])  
43.3 ms ± 4.13 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

Although the difference gets smaller when there are more missing values (e.g., with 50% missing values instead of 10% in the above, it becomes 25 vs 32 ms).

But I assume there is also a memory benefit (avoiding the temporary array), although I am not fully familiar with the inner workings of this in numpy.



def _below_min_count(values, mask, min_count):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We already do this in the nanops sum, yes?

Can you deduplicate?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it's here:

pandas/pandas/core/nanops.py

Lines 1238 to 1271 in 37a7006

def _maybe_null_out(
result: np.ndarray,
axis: Optional[int],
mask: Optional[np.ndarray],
shape: Tuple,
min_count: int = 1,
) -> float:
"""
Returns
-------
Dtype
The product of all elements on a given axis. ( NaNs are treated as 1)
"""
if mask is not None and axis is not None and getattr(result, "ndim", False):
null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0
if np.any(null_mask):
if is_numeric_dtype(result):
if np.iscomplexobj(result):
result = result.astype("c16")
else:
result = result.astype("f8")
result[null_mask] = np.nan
else:
# GH12941, use None to auto cast null
result[null_mask] = None
elif result is not NaT:
if mask is not None:
null_mask = mask.size - mask.sum()
else:
null_mask = np.prod(shape)
if null_mask < min_count:
result = np.nan
return result

But I would prefer not to deduplicate, as the one in nanops is mixed with other logic / more complex because it needs to handle more dimensions.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

disagree this is more code to maintain which keeps happening with ios

Copy link
Member Author

@jorisvandenbossche jorisvandenbossche Feb 20, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, what's "with ios" ?

"""
Check for the `min_count` keyword. Returns True if below `min_count` (when
pd.NA should be returned from the reduction).
"""
if min_count > 0:
if mask is None:
# no missing values, only check size
non_nulls = values.size
else:
non_nulls = mask.size - mask.sum()
if non_nulls < min_count:
return True
return False
5 changes: 4 additions & 1 deletion pandas/tests/arrays/test_integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -911,7 +911,10 @@ def test_preserve_dtypes(op):

# op
result = getattr(df.C, op)()
assert isinstance(result, int)
if op == "sum":
assert isinstance(result, np.int64)
else:
assert isinstance(result, int)

# groupby
result = getattr(df.groupby("A"), op)()
Expand Down
29 changes: 18 additions & 11 deletions pandas/tests/reductions/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,13 +528,14 @@ def test_sum_inf(self):
res = nanops.nansum(arr, axis=1)
assert np.isinf(res).all()

@pytest.mark.parametrize("dtype", ["float64", "Int64", "boolean"])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should object be included here?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe None?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added object dtype

@pytest.mark.parametrize("use_bottleneck", [True, False])
@pytest.mark.parametrize("method, unit", [("sum", 0.0), ("prod", 1.0)])
def test_empty(self, method, unit, use_bottleneck):
def test_empty(self, method, unit, use_bottleneck, dtype):
with pd.option_context("use_bottleneck", use_bottleneck):
# GH#9422 / GH#18921
# Entirely empty
s = Series([], dtype=object)
s = Series([], dtype=dtype)
# NA by default
result = getattr(s, method)()
assert result == unit
Expand All @@ -557,8 +558,14 @@ def test_empty(self, method, unit, use_bottleneck):
result = getattr(s, method)(skipna=True, min_count=1)
assert pd.isna(result)

result = getattr(s, method)(skipna=False, min_count=0)
assert result == unit

result = getattr(s, method)(skipna=False, min_count=1)
assert pd.isna(result)

# All-NA
s = Series([np.nan])
s = Series([np.nan], dtype=dtype)
# NA by default
result = getattr(s, method)()
assert result == unit
Expand All @@ -582,7 +589,7 @@ def test_empty(self, method, unit, use_bottleneck):
assert pd.isna(result)

# Mix of valid, empty
s = Series([np.nan, 1])
s = Series([np.nan, 1], dtype=dtype)
# Default
result = getattr(s, method)()
assert result == 1.0
Expand All @@ -601,22 +608,22 @@ def test_empty(self, method, unit, use_bottleneck):
result = getattr(s, method)(skipna=True, min_count=0)
assert result == 1.0

result = getattr(s, method)(skipna=True, min_count=1)
assert result == 1.0

# GH#844 (changed in GH#9422)
df = DataFrame(np.empty((10, 0)))
df = DataFrame(np.empty((10, 0)), dtype=dtype)
assert (getattr(df, method)(1) == unit).all()

s = pd.Series([1])
s = pd.Series([1], dtype=dtype)
result = getattr(s, method)(min_count=2)
assert pd.isna(result)

s = pd.Series([np.nan])
result = getattr(s, method)(skipna=False, min_count=2)
assert pd.isna(result)

s = pd.Series([np.nan], dtype=dtype)
result = getattr(s, method)(min_count=2)
assert pd.isna(result)

s = pd.Series([np.nan, 1])
s = pd.Series([np.nan, 1], dtype=dtype)
result = getattr(s, method)(min_count=2)
assert pd.isna(result)

Expand Down