From 9795f353bc8654803797a50965357aaefb93b97e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 13 Jan 2020 21:29:44 +0100 Subject: [PATCH 01/18] POC masked ops for reductions --- pandas/core/arrays/integer.py | 5 ++++- pandas/core/ops/mask_ops.py | 29 +++++++++++++++++++++++++++++ pandas/tests/arrays/test_integer.py | 2 +- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index cb1e7115cd3c2..dad00b4322696 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -24,7 +24,7 @@ from pandas.core.dtypes.missing import isna, notna from pandas.core import nanops, ops -from pandas.core.ops import invalid_comparison +from pandas.core.ops import invalid_comparison, mask_ops from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.tools.numeric import to_numeric @@ -548,6 +548,9 @@ def _reduce(self, name, skipna=True, **kwargs): data = self._data mask = self._mask + if name == "sum": + return mask_ops.sum(data, mask, skipna=skipna) + # coerce to a nan-aware float if needed if mask.any(): data = self._data.astype("float64") diff --git a/pandas/core/ops/mask_ops.py b/pandas/core/ops/mask_ops.py index 8fb81faf313d7..fc204a29bdda2 100644 --- a/pandas/core/ops/mask_ops.py +++ b/pandas/core/ops/mask_ops.py @@ -6,6 +6,7 @@ import numpy as np from pandas._libs import lib, missing as libmissing +from pandas.compat.numpy import _np_version_under1p17 def kleene_or( @@ -176,3 +177,31 @@ def kleene_and( def raise_for_nan(value, method): if lib.is_float(value) and np.isnan(value): raise ValueError(f"Cannot perform logical '{method}' with floating NaN") + + +def sum( + values: np.ndarray, mask: np.ndarray, skipna: bool, +): + """ + Sum for 1D masked array. + + Parameters + ---------- + values : np.ndarray + Numpy array with the values (can be of any dtype that support the + operation). + mask : np.ndarray + Boolean numpy array (False for missing) + skipna: bool, default True + Whether to skip NA. + """ + if not skipna: + if mask.any(): + return libmissing.NA + else: + return np.sum(values) + else: + if _np_version_under1p17: + return np.sum(values[mask]) + else: + return np.sum(values, where=~mask) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index f1a7cc741603d..9bfaa3b433fce 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -887,7 +887,7 @@ def test_preserve_dtypes(op): # op result = getattr(df.C, op)() - assert isinstance(result, int) + assert isinstance(result, np.int64) # groupby result = getattr(df.groupby("A"), op)() From cd2692055521343479d275f5db8912eade6d1b5a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 13 Feb 2020 11:05:52 +0100 Subject: [PATCH 02/18] fix mask for older numpy --- pandas/core/ops/mask_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/ops/mask_ops.py b/pandas/core/ops/mask_ops.py index fc204a29bdda2..c42f99e9a912a 100644 --- a/pandas/core/ops/mask_ops.py +++ b/pandas/core/ops/mask_ops.py @@ -202,6 +202,6 @@ def sum( return np.sum(values) else: if _np_version_under1p17: - return np.sum(values[mask]) + return np.sum(values[~mask]) else: return np.sum(values, where=~mask) From 28cd331e4d0e4384850b58d9c700c5779d1a6a3c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 13 Feb 2020 11:10:12 +0100 Subject: [PATCH 03/18] also use in boolean --- pandas/core/arrays/boolean.py | 6 +++++- pandas/core/arrays/integer.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 590b40b0434e5..7ebcbd5fe8417 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -28,6 +28,7 @@ from pandas.core import nanops, ops from pandas.core.indexers import check_array_indexer +from pandas.core.ops import mask_ops from .masked import BaseMaskedArray @@ -697,6 +698,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): data = self._data mask = self._mask + if name == "sum": + return mask_ops.sum(data, mask, skipna=skipna) + # coerce to a nan-aware float if needed if self._hasna: data = self.to_numpy("float64", na_value=np.nan) @@ -708,7 +712,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): return libmissing.NA # if we have numeric op that would result in an int, coerce to int if possible - if name in ["sum", "prod"] and notna(result): + if name == "prod" and notna(result): int_result = np.int64(result) if int_result == result: result = int_result diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 3064566a25751..2d3da458f26a5 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -587,7 +587,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): # if we have a preservable numeric op, # provide coercion back to an integer type if possible - elif name in ["sum", "min", "max", "prod"]: + elif name in ["min", "max", "prod"]: # GH#31409 more performant than casting-then-checking result = com.cast_scalar_indexer(result) From 6298fbd0d136c80cd8788e9094274b31c69eb885 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 13 Feb 2020 12:01:30 +0100 Subject: [PATCH 04/18] add min_count support --- pandas/core/arrays/integer.py | 2 +- pandas/core/ops/mask_ops.py | 28 ++++++++++++++++++++-- pandas/tests/reductions/test_reductions.py | 27 +++++++++++++-------- 3 files changed, 44 insertions(+), 13 deletions(-) diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 2d3da458f26a5..7565fd86337c7 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -568,7 +568,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): mask = self._mask if name == "sum": - return mask_ops.sum(data, mask, skipna=skipna) + return mask_ops.sum(data, mask, skipna=skipna, **kwargs) # coerce to a nan-aware float if needed # (we explicitly use NaN within reductions) diff --git a/pandas/core/ops/mask_ops.py b/pandas/core/ops/mask_ops.py index c42f99e9a912a..53a12019ad167 100644 --- a/pandas/core/ops/mask_ops.py +++ b/pandas/core/ops/mask_ops.py @@ -180,7 +180,7 @@ def raise_for_nan(value, method): def sum( - values: np.ndarray, mask: np.ndarray, skipna: bool, + values: np.ndarray, mask: np.ndarray, skipna: bool, min_count: int = 0, ): """ Sum for 1D masked array. @@ -192,16 +192,40 @@ def sum( operation). mask : np.ndarray Boolean numpy array (False for missing) - skipna: bool, default True + skipna : bool, default True Whether to skip NA. + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer than + ``min_count`` non-NA values are present the result will be NA. """ if not skipna: if mask.any(): return libmissing.NA else: + if _below_min_count(values, None, min_count): + return libmissing.NA return np.sum(values) else: + if _below_min_count(values, mask, min_count): + return libmissing.NA + if _np_version_under1p17: return np.sum(values[~mask]) else: return np.sum(values, where=~mask) + + +def _below_min_count(values, mask, min_count): + """ + Check for the `min_count` keyword. Returns True if below `min_count` (when + pd.NA should be returned from the reduction). + """ + if min_count > 0: + if mask is None: + # no missing values, only check size + non_nulls = values.size + else: + non_nulls = mask.size - mask.sum() + if non_nulls < min_count: + return True + return False diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 0b312fe2f8990..0815e0cdb441c 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -528,13 +528,14 @@ def test_sum_inf(self): res = nanops.nansum(arr, axis=1) assert np.isinf(res).all() + @pytest.mark.parametrize("dtype", ["float64", "Int64"]) @pytest.mark.parametrize("use_bottleneck", [True, False]) @pytest.mark.parametrize("method, unit", [("sum", 0.0), ("prod", 1.0)]) - def test_empty(self, method, unit, use_bottleneck): + def test_empty(self, method, unit, use_bottleneck, dtype): with pd.option_context("use_bottleneck", use_bottleneck): # GH#9422 / GH#18921 # Entirely empty - s = Series([], dtype=object) + s = Series([], dtype=dtype) # NA by default result = getattr(s, method)() assert result == unit @@ -557,8 +558,14 @@ def test_empty(self, method, unit, use_bottleneck): result = getattr(s, method)(skipna=True, min_count=1) assert pd.isna(result) + result = getattr(s, method)(skipna=False, min_count=0) + assert result == unit + + result = getattr(s, method)(skipna=False, min_count=1) + assert pd.isna(result) + # All-NA - s = Series([np.nan]) + s = Series([np.nan], dtype=dtype) # NA by default result = getattr(s, method)() assert result == unit @@ -582,7 +589,7 @@ def test_empty(self, method, unit, use_bottleneck): assert pd.isna(result) # Mix of valid, empty - s = Series([np.nan, 1]) + s = Series([np.nan, 1], dtype=dtype) # Default result = getattr(s, method)() assert result == 1.0 @@ -601,22 +608,22 @@ def test_empty(self, method, unit, use_bottleneck): result = getattr(s, method)(skipna=True, min_count=0) assert result == 1.0 - result = getattr(s, method)(skipna=True, min_count=1) - assert result == 1.0 - # GH#844 (changed in GH#9422) df = DataFrame(np.empty((10, 0))) assert (getattr(df, method)(1) == unit).all() - s = pd.Series([1]) + s = pd.Series([1], dtype=dtype) result = getattr(s, method)(min_count=2) assert pd.isna(result) - s = pd.Series([np.nan]) + result = getattr(s, method)(skipna=False, min_count=2) + assert pd.isna(result) + + s = pd.Series([np.nan], dtype=dtype) result = getattr(s, method)(min_count=2) assert pd.isna(result) - s = pd.Series([np.nan, 1]) + s = pd.Series([np.nan, 1], dtype=dtype) result = getattr(s, method)(min_count=2) assert pd.isna(result) From 735a741c33b72ea5a97bc3c719f74d052c8106dd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 13 Feb 2020 12:03:56 +0100 Subject: [PATCH 05/18] fix preserve_dtypes test --- pandas/tests/arrays/test_integer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index da9a74296a486..e34da6e45afc3 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -911,7 +911,10 @@ def test_preserve_dtypes(op): # op result = getattr(df.C, op)() - assert isinstance(result, np.int64) + if op == "sum": + assert isinstance(result, np.int64) + else: + assert isinstance(result, int) # groupby result = getattr(df.groupby("A"), op)() From 6df454fc16b1e38fd336bf71e71015a545b82774 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 13 Feb 2020 12:06:34 +0100 Subject: [PATCH 06/18] passthrough min_count for boolean as well --- pandas/core/arrays/boolean.py | 2 +- pandas/tests/reductions/test_reductions.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 7ebcbd5fe8417..12cc1f9faacbd 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -699,7 +699,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): mask = self._mask if name == "sum": - return mask_ops.sum(data, mask, skipna=skipna) + return mask_ops.sum(data, mask, skipna=skipna, **kwargs) # coerce to a nan-aware float if needed if self._hasna: diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 0815e0cdb441c..7596b88309592 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -528,7 +528,7 @@ def test_sum_inf(self): res = nanops.nansum(arr, axis=1) assert np.isinf(res).all() - @pytest.mark.parametrize("dtype", ["float64", "Int64"]) + @pytest.mark.parametrize("dtype", ["float64", "Int64", "boolean"]) @pytest.mark.parametrize("use_bottleneck", [True, False]) @pytest.mark.parametrize("method, unit", [("sum", 0.0), ("prod", 1.0)]) def test_empty(self, method, unit, use_bottleneck, dtype): @@ -609,7 +609,7 @@ def test_empty(self, method, unit, use_bottleneck, dtype): assert result == 1.0 # GH#844 (changed in GH#9422) - df = DataFrame(np.empty((10, 0))) + df = DataFrame(np.empty((10, 0)), dtype=dtype) assert (getattr(df, method)(1) == unit).all() s = pd.Series([1], dtype=dtype) From 5eb48d6d6a19d250c792dec53530aaef1998c90d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 13 Feb 2020 23:16:16 +0100 Subject: [PATCH 07/18] fix comment --- pandas/core/ops/mask_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/ops/mask_ops.py b/pandas/core/ops/mask_ops.py index 53a12019ad167..5e760e3d6da93 100644 --- a/pandas/core/ops/mask_ops.py +++ b/pandas/core/ops/mask_ops.py @@ -191,7 +191,7 @@ def sum( Numpy array with the values (can be of any dtype that support the operation). mask : np.ndarray - Boolean numpy array (False for missing) + Boolean numpy array (True values indicate missing values). skipna : bool, default True Whether to skip NA. min_count : int, default 0 From d2230fd6d3c106ba5c672071c0d671931ddd70e3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 20 Feb 2020 16:51:28 +0100 Subject: [PATCH 08/18] add object to empty reduction test case --- pandas/tests/reductions/test_reductions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index bf353bdf5621d..562831a912f3b 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -533,7 +533,7 @@ def test_sum_inf(self): res = nanops.nansum(arr, axis=1) assert np.isinf(res).all() - @pytest.mark.parametrize("dtype", ["float64", "Int64", "boolean"]) + @pytest.mark.parametrize("dtype", ["float64", "Int64", "boolean", "object"]) @pytest.mark.parametrize("use_bottleneck", [True, False]) @pytest.mark.parametrize("method, unit", [("sum", 0.0), ("prod", 1.0)]) def test_empty(self, method, unit, use_bottleneck, dtype): From 19ac821644ea857ee98685459e458411850ebe90 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 21 Feb 2020 09:55:45 +0100 Subject: [PATCH 09/18] test platform int --- pandas/tests/arrays/test_boolean.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index d14d6f3ff0c41..58deaa32290c0 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -827,7 +827,7 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions): s = s.dropna() if op in ("sum", "prod"): - assert isinstance(getattr(s, op)(), np.int64) + assert isinstance(getattr(s, op)(), np.intp) elif op in ("min", "max"): assert isinstance(getattr(s, op)(), np.bool_) else: From 277643694c4bbeed8bd0df2a29999f8c6c8facbb Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 21 Feb 2020 10:37:34 +0100 Subject: [PATCH 10/18] Test sum separately with platform int --- pandas/tests/arrays/test_boolean.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index 58deaa32290c0..0872630b6e979 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -826,8 +826,10 @@ def test_reductions_return_types(dropna, data, all_numeric_reductions): if dropna: s = s.dropna() - if op in ("sum", "prod"): - assert isinstance(getattr(s, op)(), np.intp) + if op == "sum": + assert isinstance(getattr(s, op)(), np.int_) + elif op == "prod": + assert isinstance(getattr(s, op)(), np.int64) elif op in ("min", "max"): assert isinstance(getattr(s, op)(), np.bool_) else: From 18d5bfa7a4fe35cf3d1efeb1a8030dcfef8d19dc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 20 Mar 2020 20:26:21 +0100 Subject: [PATCH 11/18] share min_count checking helper function with nanops --- pandas/core/nanops.py | 22 +++++++++++++++++----- pandas/core/ops/mask_ops.py | 22 ++++------------------ 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index a5e70bd279d21..5b9d2509a3f22 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1260,16 +1260,28 @@ def _maybe_null_out( # GH12941, use None to auto cast null result[null_mask] = None elif result is not NaT: - if mask is not None: - null_mask = mask.size - mask.sum() - else: - null_mask = np.prod(shape) - if null_mask < min_count: + if _below_min_count(shape, mask, min_count): result = np.nan return result +def _below_min_count(shape, mask, min_count): + """ + Check for the `min_count` keyword. Returns True if below `min_count` (when + missing value should be returned from the reduction). + """ + if min_count > 0: + if mask is None: + # no missing values, only check size + non_nulls = np.prod(shape) + else: + non_nulls = mask.size - mask.sum() + if non_nulls < min_count: + return True + return False + + def _zero_out_fperr(arg): # #18044 reference this behavior to fix rolling skew/kurt issue if isinstance(arg, np.ndarray): diff --git a/pandas/core/ops/mask_ops.py b/pandas/core/ops/mask_ops.py index 5e760e3d6da93..90e4bf1dece83 100644 --- a/pandas/core/ops/mask_ops.py +++ b/pandas/core/ops/mask_ops.py @@ -8,6 +8,8 @@ from pandas._libs import lib, missing as libmissing from pandas.compat.numpy import _np_version_under1p17 +from pandas.core.nanops import _below_min_count + def kleene_or( left: Union[bool, np.ndarray], @@ -202,30 +204,14 @@ def sum( if mask.any(): return libmissing.NA else: - if _below_min_count(values, None, min_count): + if _below_min_count(values.shape, None, min_count): return libmissing.NA return np.sum(values) else: - if _below_min_count(values, mask, min_count): + if _below_min_count(values.shape, mask, min_count): return libmissing.NA if _np_version_under1p17: return np.sum(values[~mask]) else: return np.sum(values, where=~mask) - - -def _below_min_count(values, mask, min_count): - """ - Check for the `min_count` keyword. Returns True if below `min_count` (when - pd.NA should be returned from the reduction). - """ - if min_count > 0: - if mask is None: - # no missing values, only check size - non_nulls = values.size - else: - non_nulls = mask.size - mask.sum() - if non_nulls < min_count: - return True - return False From 4df858fdeab2c624d059a2a22d4d6dadaf7c131a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 23 Mar 2020 10:07:59 +0100 Subject: [PATCH 12/18] type + add docstring for min_count --- pandas/core/nanops.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index ea4333c1e8b3e..c0afb6285715a 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1238,7 +1238,7 @@ def _maybe_null_out( result: np.ndarray, axis: Optional[int], mask: Optional[np.ndarray], - shape: Tuple, + shape: Tuple[int], min_count: int = 1, ) -> float: """ @@ -1266,10 +1266,23 @@ def _maybe_null_out( return result -def _below_min_count(shape, mask, min_count): +def _below_min_count(shape: Tuple[int], mask: Optional[np.ndarray], min_count: int): """ Check for the `min_count` keyword. Returns True if below `min_count` (when missing value should be returned from the reduction). + + Parameters + ---------- + shape : tuple + The shape of the values (`values.shape`). + mask : ndarray or None + Boolean numpy array (typically of same shape as `shape`) or None. + min_count : int + Keyword passed through from sum/prod call. + + Returns + ------- + bool """ if min_count > 0: if mask is None: From 76c5149799dfa00cf5258237f69529010dfee492 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 25 Mar 2020 14:03:26 +0100 Subject: [PATCH 13/18] move sum algo from ops to array_algos --- pandas/core/array_algos/masked_reductions.py | 47 ++++++++++++++++++++ pandas/core/arrays/boolean.py | 4 +- pandas/core/arrays/integer.py | 5 ++- pandas/core/ops/mask_ops.py | 39 ---------------- 4 files changed, 52 insertions(+), 43 deletions(-) create mode 100644 pandas/core/array_algos/masked_reductions.py diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py new file mode 100644 index 0000000000000..d9fa4cf3fb3f5 --- /dev/null +++ b/pandas/core/array_algos/masked_reductions.py @@ -0,0 +1,47 @@ +""" +masked_reductions.py is for reduction algorithms using a mask-based approach +for missing values. +""" + +import numpy as np + +from pandas._libs import missing as libmissing +from pandas.compat.numpy import _np_version_under1p17 + +from pandas.core.nanops import _below_min_count + + +def sum( + values: np.ndarray, mask: np.ndarray, skipna: bool, min_count: int = 0, +): + """ + Sum for 1D masked array. + + Parameters + ---------- + values : np.ndarray + Numpy array with the values (can be of any dtype that support the + operation). + mask : np.ndarray + Boolean numpy array (True values indicate missing values). + skipna : bool, default True + Whether to skip NA. + min_count : int, default 0 + The required number of valid values to perform the operation. If fewer than + ``min_count`` non-NA values are present the result will be NA. + """ + if not skipna: + if mask.any(): + return libmissing.NA + else: + if _below_min_count(values.shape, None, min_count): + return libmissing.NA + return np.sum(values) + else: + if _below_min_count(values.shape, mask, min_count): + return libmissing.NA + + if _np_version_under1p17: + return np.sum(values[~mask]) + else: + return np.sum(values, where=~mask) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index dbb6cb212cb97..442d4ca8cef6d 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -27,8 +27,8 @@ from pandas.core.dtypes.missing import isna, notna from pandas.core import nanops, ops +from pandas.core.array_algos import masked_reductions from pandas.core.indexers import check_array_indexer -from pandas.core.ops import mask_ops from .masked import BaseMaskedArray @@ -697,7 +697,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): mask = self._mask if name == "sum": - return mask_ops.sum(data, mask, skipna=skipna, **kwargs) + return masked_reductions.sum(data, mask, skipna=skipna, **kwargs) # coerce to a nan-aware float if needed if self._hasna: diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 67c06bd5e9b08..4f3c68aa03b16 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -27,9 +27,10 @@ from pandas.core.dtypes.missing import isna from pandas.core import nanops, ops +from pandas.core.array_algos import masked_reductions import pandas.core.common as com from pandas.core.indexers import check_array_indexer -from pandas.core.ops import invalid_comparison, mask_ops +from pandas.core.ops import invalid_comparison from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.tools.numeric import to_numeric @@ -561,7 +562,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs): mask = self._mask if name == "sum": - return mask_ops.sum(data, mask, skipna=skipna, **kwargs) + return masked_reductions.sum(data, mask, skipna=skipna, **kwargs) # coerce to a nan-aware float if needed # (we explicitly use NaN within reductions) diff --git a/pandas/core/ops/mask_ops.py b/pandas/core/ops/mask_ops.py index 90e4bf1dece83..8fb81faf313d7 100644 --- a/pandas/core/ops/mask_ops.py +++ b/pandas/core/ops/mask_ops.py @@ -6,9 +6,6 @@ import numpy as np from pandas._libs import lib, missing as libmissing -from pandas.compat.numpy import _np_version_under1p17 - -from pandas.core.nanops import _below_min_count def kleene_or( @@ -179,39 +176,3 @@ def kleene_and( def raise_for_nan(value, method): if lib.is_float(value) and np.isnan(value): raise ValueError(f"Cannot perform logical '{method}' with floating NaN") - - -def sum( - values: np.ndarray, mask: np.ndarray, skipna: bool, min_count: int = 0, -): - """ - Sum for 1D masked array. - - Parameters - ---------- - values : np.ndarray - Numpy array with the values (can be of any dtype that support the - operation). - mask : np.ndarray - Boolean numpy array (True values indicate missing values). - skipna : bool, default True - Whether to skip NA. - min_count : int, default 0 - The required number of valid values to perform the operation. If fewer than - ``min_count`` non-NA values are present the result will be NA. - """ - if not skipna: - if mask.any(): - return libmissing.NA - else: - if _below_min_count(values.shape, None, min_count): - return libmissing.NA - return np.sum(values) - else: - if _below_min_count(values.shape, mask, min_count): - return libmissing.NA - - if _np_version_under1p17: - return np.sum(values[~mask]) - else: - return np.sum(values, where=~mask) From b2162dc9f1e88c5670b7cb0f3065076c470fe097 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 26 Mar 2020 10:22:33 +0100 Subject: [PATCH 14/18] add Int64/boolean to some benchmarks --- asv_bench/benchmarks/series_methods.py | 25 ++++++++++++++----------- asv_bench/benchmarks/stat_ops.py | 10 ++++++++-- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 57c625ced8a43..d78419c12ce0d 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -223,27 +223,27 @@ def time_series_datetimeindex_repr(self): class All: - params = [[10 ** 3, 10 ** 6], ["fast", "slow"]] - param_names = ["N", "case"] + params = [[10 ** 3, 10 ** 6], ["fast", "slow"], ["bool", "boolean"]] + param_names = ["N", "case", "dtype"] - def setup(self, N, case): + def setup(self, N, case, dtype): val = case != "fast" - self.s = Series([val] * N) + self.s = Series([val] * N, dtype=dtype) - def time_all(self, N, case): + def time_all(self, N, case, dtype): self.s.all() class Any: - params = [[10 ** 3, 10 ** 6], ["fast", "slow"]] - param_names = ["N", "case"] + params = [[10 ** 3, 10 ** 6], ["fast", "slow"], ["bool", "boolean"]] + param_names = ["N", "case", "dtype"] - def setup(self, N, case): + def setup(self, N, case, dtype): val = case == "fast" - self.s = Series([val] * N) + self.s = Series([val] * N, dtype=dtype) - def time_any(self, N, case): + def time_any(self, N, case, dtype): self.s.any() @@ -265,11 +265,14 @@ class NanOps: "prod", ], [10 ** 3, 10 ** 6], - ["int8", "int32", "int64", "float64"], + ["int8", "int32", "int64", "float64", "Int64", "boolean"], ] param_names = ["func", "N", "dtype"] def setup(self, func, N, dtype): + if func == "argmax" and dtype in {"Int64", "boolean"}: + # Skip argmax for nullable int since this doesn't work yet (GH-24382) + raise NotImplementedError self.s = Series([1] * N, dtype=dtype) self.func = getattr(self.s, func) diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index ec67394e55a1e..ebbd3c9eddfdb 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -7,11 +7,17 @@ class FrameOps: - params = [ops, ["float", "int"], [0, 1]] + params = [ops, ["float", "int", "Int64"], [0, 1]] param_names = ["op", "dtype", "axis"] def setup(self, op, dtype, axis): - df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype) + if op == "mad" and dtype == "Int64" and axis == 1: + # GH-33036 + raise NotImplementedError + values = np.random.randn(100000, 4) + if dtype == "Int64": + values = values.astype(int) + df = pd.DataFrame(values).astype(dtype) self.df_func = getattr(df, op) def time_op(self, op, dtype, axis): From d4746f5586219b5ecd803c1511a2e0adaf8f0cd2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 26 Mar 2020 10:24:00 +0100 Subject: [PATCH 15/18] add whatsnew --- doc/source/whatsnew/v1.1.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 8cb80c7c92f8e..4cfd47894a776 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -229,6 +229,8 @@ Performance improvements sparse values from ``scipy.sparse`` matrices using the :meth:`DataFrame.sparse.from_spmatrix` constructor (:issue:`32821`, :issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`). +- Performance improvement in :meth:`Series.sum` for nullable (integer and boolean) dtypes (:issue:`30982`). + .. --------------------------------------------------------------------------- From d9c2cbfcd527242db4bf67622a14fe524021fedb Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 26 Mar 2020 11:20:31 +0100 Subject: [PATCH 16/18] add skipna default in function signature --- pandas/core/array_algos/masked_reductions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index d9fa4cf3fb3f5..44b648b762a86 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -12,7 +12,7 @@ def sum( - values: np.ndarray, mask: np.ndarray, skipna: bool, min_count: int = 0, + values: np.ndarray, mask: np.ndarray, skipna: bool = True, min_count: int = 0, ): """ Sum for 1D masked array. From f8705c269ad5d5432b84bf08c8fe043933b695e4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 26 Mar 2020 17:16:34 +0100 Subject: [PATCH 17/18] update type hint + deprivatize --- pandas/core/array_algos/masked_reductions.py | 6 +++--- pandas/core/nanops.py | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 44b648b762a86..0fb2605b554c2 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -8,7 +8,7 @@ from pandas._libs import missing as libmissing from pandas.compat.numpy import _np_version_under1p17 -from pandas.core.nanops import _below_min_count +from pandas.core.nanops import check_below_min_count def sum( @@ -34,11 +34,11 @@ def sum( if mask.any(): return libmissing.NA else: - if _below_min_count(values.shape, None, min_count): + if check_below_min_count(values.shape, None, min_count): return libmissing.NA return np.sum(values) else: - if _below_min_count(values.shape, mask, min_count): + if check_below_min_count(values.shape, mask, min_count): return libmissing.NA if _np_version_under1p17: diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index c0afb6285715a..46eec67b0f428 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1260,13 +1260,15 @@ def _maybe_null_out( # GH12941, use None to auto cast null result[null_mask] = None elif result is not NaT: - if _below_min_count(shape, mask, min_count): + if check_below_min_count(shape, mask, min_count): result = np.nan return result -def _below_min_count(shape: Tuple[int], mask: Optional[np.ndarray], min_count: int): +def check_below_min_count( + shape: Tuple[int, ...], mask: Optional[np.ndarray], min_count: int +): """ Check for the `min_count` keyword. Returns True if below `min_count` (when missing value should be returned from the reduction). From 1a43e1058cdc12fc0a05e7a990a16e11f463e45d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 26 Mar 2020 17:17:44 +0100 Subject: [PATCH 18/18] update another type hint --- pandas/core/nanops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 46eec67b0f428..822ab775e7e46 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1238,7 +1238,7 @@ def _maybe_null_out( result: np.ndarray, axis: Optional[int], mask: Optional[np.ndarray], - shape: Tuple[int], + shape: Tuple[int, ...], min_count: int = 1, ) -> float: """