pandas-dev · jbrockmendel · Mar 27, 2020 · Jan 13, 2020 · Feb 13, 2020 · Feb 13, 2020
diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
@@ -28,6 +28,7 @@
 
 from pandas.core import nanops, ops
 from pandas.core.indexers import check_array_indexer
+from pandas.core.ops import mask_ops
 
 from .masked import BaseMaskedArray
 
@@ -697,6 +698,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
         data = self._data
         mask = self._mask
 
+        if name == "sum":
+            return mask_ops.sum(data, mask, skipna=skipna, **kwargs)
+
         # coerce to a nan-aware float if needed
         if self._hasna:
             data = self.to_numpy("float64", na_value=np.nan)
@@ -708,7 +712,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
             return libmissing.NA
 
         # if we have numeric op that would result in an int, coerce to int if possible
-        if name in ["sum", "prod"] and notna(result):
+        if name == "prod" and notna(result):
             int_result = np.int64(result)
             if int_result == result:
                 result = int_result

diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -28,7 +28,7 @@
 from pandas.core import nanops, ops
 import pandas.core.common as com
 from pandas.core.indexers import check_array_indexer
-from pandas.core.ops import invalid_comparison
+from pandas.core.ops import invalid_comparison, mask_ops
 from pandas.core.ops.common import unpack_zerodim_and_defer
 from pandas.core.tools.numeric import to_numeric
 
@@ -567,6 +567,9 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
         data = self._data
         mask = self._mask
 
+        if name == "sum":
+            return mask_ops.sum(data, mask, skipna=skipna, **kwargs)
+
         # coerce to a nan-aware float if needed
         # (we explicitly use NaN within reductions)
         if self._hasna:
@@ -584,7 +587,7 @@ def _reduce(self, name: str, skipna: bool = True, **kwargs):
 
         # if we have a preservable numeric op,
         # provide coercion back to an integer type if possible
-        elif name in ["sum", "min", "max", "prod"]:
+        elif name in ["min", "max", "prod"]:
             # GH#31409 more performant than casting-then-checking
             result = com.cast_scalar_indexer(result)
 

diff --git a/pandas/core/ops/mask_ops.py b/pandas/core/ops/mask_ops.py
@@ -6,6 +6,7 @@
 import numpy as np
 
 from pandas._libs import lib, missing as libmissing
+from pandas.compat.numpy import _np_version_under1p17
 
 
 def kleene_or(
@@ -176,3 +177,55 @@ def kleene_and(
 def raise_for_nan(value, method):
     if lib.is_float(value) and np.isnan(value):
         raise ValueError(f"Cannot perform logical '{method}' with floating NaN")
+
+
+def sum(
+    values: np.ndarray, mask: np.ndarray, skipna: bool, min_count: int = 0,
+):
+    """
+    Sum for 1D masked array.
+
+    Parameters
+    ----------
+    values : np.ndarray
+        Numpy array with the values (can be of any dtype that supports the
+        operation).
+    mask : np.ndarray
+        Boolean numpy array (True values indicate missing values).
+    skipna : bool
+        Whether to skip NA.
+    min_count : int, default 0
+        The required number of valid values to perform the operation. If fewer than
+        ``min_count`` non-NA values are present the result will be NA.
+    """
+    if not skipna:
+        if mask.any():
+            return libmissing.NA
+        else:
+            if _below_min_count(values, None, min_count):
+                return libmissing.NA
+            return np.sum(values)
+    else:
+        if _below_min_count(values, mask, min_count):
+            return libmissing.NA
+
+        if _np_version_under1p17:
+            return np.sum(values[~mask])
+        else:
+            return np.sum(values, where=~mask)
+
+
+def _below_min_count(values, mask, min_count):
 def _maybe_null_out( 
     result: np.ndarray, 
     axis: Optional[int], 
     mask: Optional[np.ndarray], 
     shape: Tuple, 
     min_count: int = 1, 
 ) -> float: 
     """ 
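     Null out entries of ``result`` that are backed by fewer than
     ``min_count`` non-masked values.
 
     Returns
     -------
     result
         ``result`` with under-populated entries set to NaN (None for
         non-numeric results); a scalar result is replaced by ``np.nan``.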
     """ 
     if mask is not None and axis is not None and getattr(result, "ndim", False): 
         null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0 
         if np.any(null_mask): 
             if is_numeric_dtype(result): 
                 if np.iscomplexobj(result): 
                     result = result.astype("c16") 
                 else: 
                     result = result.astype("f8") 
                 result[null_mask] = np.nan 
             else: 
                 # GH12941, use None to auto cast null 
                 result[null_mask] = None 
     elif result is not NaT: 
         if mask is not None: 
             null_mask = mask.size - mask.sum() 
         else: 
             null_mask = np.prod(shape) 
         if null_mask < min_count: 
             result = np.nan 
     return result 
 def _maybe_null_out( 
     result: np.ndarray, 
     axis: Optional[int], 
     mask: Optional[np.ndarray], 
     shape: Tuple, 
     min_count: int = 1, 
 ) -> float: 
     """ 
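     Null out entries of ``result`` that are backed by fewer than
     ``min_count`` non-masked values.
 
     Returns
     -------
     result
         ``result`` with under-populated entries set to NaN (None for
         non-numeric results); a scalar result is replaced by ``np.nan``.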
     """ 
     if mask is not None and axis is not None and getattr(result, "ndim", False): 
         null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0 
         if np.any(null_mask): 
             if is_numeric_dtype(result): 
                 if np.iscomplexobj(result): 
                     result = result.astype("c16") 
                 else: 
                     result = result.astype("f8") 
                 result[null_mask] = np.nan 
             else: 
                 # GH12941, use None to auto cast null 
                 result[null_mask] = None 
     elif result is not NaT: 
         if mask is not None: 
             null_mask = mask.size - mask.sum() 
         else: 
             null_mask = np.prod(shape) 
         if null_mask < min_count: 
             result = np.nan 
  
     return result 
+    """
+    Check for the `min_count` keyword. Returns True if below `min_count` (when
+    pd.NA should be returned from the reduction).
+    """
+    if min_count > 0:
+        if mask is None:
+            # no missing values, only check size
+            non_nulls = values.size
+        else:
+            non_nulls = mask.size - mask.sum()
+        if non_nulls < min_count:
+            return True
+    return False
diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py
@@ -911,7 +911,10 @@ def test_preserve_dtypes(op):
 
     # op
     result = getattr(df.C, op)()
-    assert isinstance(result, int)
+    if op == "sum":
+        assert isinstance(result, np.int64)
+    else:
+        assert isinstance(result, int)
 
     # groupby
     result = getattr(df.groupby("A"), op)()

diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
@@ -528,13 +528,14 @@ def test_sum_inf(self):
         res = nanops.nansum(arr, axis=1)
         assert np.isinf(res).all()
 
+    @pytest.mark.parametrize("dtype", ["float64", "Int64", "boolean"])
     @pytest.mark.parametrize("use_bottleneck", [True, False])
     @pytest.mark.parametrize("method, unit", [("sum", 0.0), ("prod", 1.0)])
-    def test_empty(self, method, unit, use_bottleneck):
+    def test_empty(self, method, unit, use_bottleneck, dtype):
         with pd.option_context("use_bottleneck", use_bottleneck):
             # GH#9422 / GH#18921
             # Entirely empty
-            s = Series([], dtype=object)
+            s = Series([], dtype=dtype)
             # NA by default
             result = getattr(s, method)()
             assert result == unit
@@ -557,8 +558,14 @@ def test_empty(self, method, unit, use_bottleneck):
             result = getattr(s, method)(skipna=True, min_count=1)
             assert pd.isna(result)
 
+            result = getattr(s, method)(skipna=False, min_count=0)
+            assert result == unit
+
+            result = getattr(s, method)(skipna=False, min_count=1)
+            assert pd.isna(result)
+
             # All-NA
-            s = Series([np.nan])
+            s = Series([np.nan], dtype=dtype)
             # NA by default
             result = getattr(s, method)()
             assert result == unit
@@ -582,7 +589,7 @@ def test_empty(self, method, unit, use_bottleneck):
             assert pd.isna(result)
 
             # Mix of valid, empty
-            s = Series([np.nan, 1])
+            s = Series([np.nan, 1], dtype=dtype)
             # Default
             result = getattr(s, method)()
             assert result == 1.0
@@ -601,22 +608,22 @@ def test_empty(self, method, unit, use_bottleneck):
             result = getattr(s, method)(skipna=True, min_count=0)
             assert result == 1.0
 
-            result = getattr(s, method)(skipna=True, min_count=1)
-            assert result == 1.0
-
             # GH#844 (changed in GH#9422)
-            df = DataFrame(np.empty((10, 0)))
+            df = DataFrame(np.empty((10, 0)), dtype=dtype)
             assert (getattr(df, method)(1) == unit).all()
 
-            s = pd.Series([1])
+            s = pd.Series([1], dtype=dtype)
             result = getattr(s, method)(min_count=2)
             assert pd.isna(result)
 
-            s = pd.Series([np.nan])
+            result = getattr(s, method)(skipna=False, min_count=2)
+            assert pd.isna(result)
+
+            s = pd.Series([np.nan], dtype=dtype)
             result = getattr(s, method)(min_count=2)
             assert pd.isna(result)
 
-            s = pd.Series([np.nan, 1])
+            s = pd.Series([np.nan, 1], dtype=dtype)
             result = getattr(s, method)(min_count=2)
             assert pd.isna(result)