API: Use object dtype for empty Series (pandas-dev#29405)
SaturnFromTitan authored and proost committed Dec 19, 2019
1 parent 5d24c72 commit fe74426
Showing 82 changed files with 444 additions and 247 deletions.
4 changes: 2 additions & 2 deletions doc/source/user_guide/missing_data.rst
@@ -190,15 +190,15 @@ The sum of an empty or all-NA Series or column of a DataFrame is 0.
pd.Series([np.nan]).sum()
pd.Series([]).sum()
pd.Series([], dtype="float64").sum()
The product of an empty or all-NA Series or column of a DataFrame is 1.

.. ipython:: python
pd.Series([np.nan]).prod()
pd.Series([]).prod()
pd.Series([], dtype="float64").prod()
NA values in GroupBy
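
A minimal sketch of the behaviour documented above (pandas 0.22+ semantics; the explicit dtype is only there to avoid the new empty-Series warning):

import numpy as np
import pandas as pd

# Empty and all-NA Series reduce to the additive / multiplicative identities.
pd.Series([], dtype="float64").sum()    # 0.0
pd.Series([np.nan]).sum()               # 0.0
pd.Series([], dtype="float64").prod()   # 1.0
pd.Series([np.nan]).prod()              # 1.0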
1 change: 1 addition & 0 deletions doc/source/user_guide/scale.rst
@@ -358,6 +358,7 @@ results will fit in memory, so we can safely call ``compute`` without running
out of memory. At that point it's just a regular pandas object.

.. ipython:: python
:okwarning:
@savefig dask_resample.png
ddf[['x', 'y']].resample("1D").mean().cumsum().compute().plot()
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.19.0.rst
@@ -707,6 +707,7 @@ A ``Series`` will now correctly promote its dtype for assignment with incompat v


.. ipython:: python
:okwarning:
s = pd.Series()
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.rst
@@ -428,6 +428,7 @@ Note that this also changes the sum of an empty ``Series``. Previously this alwa
but for consistency with the all-NaN case, this was changed to return NaN as well:

.. ipython:: python
:okwarning:
pd.Series([]).sum()
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v0.22.0.rst
@@ -55,6 +55,7 @@ The default sum for empty or all-*NA* ``Series`` is now ``0``.
*pandas 0.22.0*

.. ipython:: python
:okwarning:
pd.Series([]).sum()
pd.Series([np.nan]).sum()
@@ -67,6 +68,7 @@ pandas 0.20.3 without bottleneck, or pandas 0.21.x), use the ``min_count``
keyword.

.. ipython:: python
:okwarning:
pd.Series([]).sum(min_count=1)
@@ -85,6 +87,7 @@ required for a non-NA sum or product.
returning ``1`` instead.

.. ipython:: python
:okwarning:
pd.Series([]).prod()
pd.Series([np.nan]).prod()
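
For reference, a small sketch of the ``min_count`` keyword mentioned above; the explicit dtype merely silences the deprecation warning this commit introduces:

import numpy as np
import pandas as pd

s = pd.Series([], dtype="float64")
s.sum()                               # 0.0 -- default min_count=0
s.sum(min_count=1)                    # nan -- at least one non-NA value required
pd.Series([np.nan]).sum(min_count=1)  # nan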
19 changes: 18 additions & 1 deletion doc/source/whatsnew/v1.0.0.rst
@@ -366,6 +366,23 @@ When :class:`Categorical` contains ``np.nan``,
pd.Categorical([1, 2, np.nan], ordered=True).min()
Default dtype of empty :class:`pandas.Series`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Initialising an empty :class:`pandas.Series` without specifying a dtype now raises a ``DeprecationWarning``
(:issue:`17261`). The default dtype will change from ``float64`` to ``object`` in future releases so that it is
consistent with the behaviour of :class:`DataFrame` and :class:`Index`.

*pandas 1.0.0*

.. code-block:: ipython
In [1]: pd.Series()
Out[1]:
DeprecationWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning.
Series([], dtype: float64)
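
User code can opt out of the warning by passing a dtype explicitly; a minimal sketch:

import pandas as pd

pd.Series(dtype="float64")  # keeps today's default, no warning
pd.Series(dtype=object)     # opts in to the future default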
.. _whatsnew_1000.api_breaking.deps:

Increased minimum versions for dependencies
@@ -494,7 +511,7 @@ Removal of prior version deprecations/changes

Previously, pandas would register converters with matplotlib as a side effect of importing pandas (:issue:`18720`).
This changed the output of plots made via matplotlib plots after pandas was imported, even if you were using
matplotlib directly rather than rather than :meth:`~DataFrame.plot`.
matplotlib directly rather than :meth:`~DataFrame.plot`.

To use pandas formatters with a matplotlib plot, specify

2 changes: 1 addition & 1 deletion pandas/compat/pickle_compat.py
@@ -64,7 +64,7 @@ def __new__(cls) -> "Series": # type: ignore
stacklevel=6,
)

return Series()
return Series(dtype=object)


class _LoadSparseFrame:
19 changes: 16 additions & 3 deletions pandas/core/apply.py
@@ -15,6 +15,8 @@
)
from pandas.core.dtypes.generic import ABCMultiIndex, ABCSeries

from pandas.core.construction import create_series_with_explicit_dtype

if TYPE_CHECKING:
from pandas import DataFrame, Series, Index

@@ -203,15 +205,15 @@ def apply_empty_result(self):

if not should_reduce:
try:
r = self.f(Series([]))
r = self.f(Series([], dtype=np.float64))
except Exception:
pass
else:
should_reduce = not isinstance(r, Series)

if should_reduce:
if len(self.agg_axis):
r = self.f(Series([]))
r = self.f(Series([], dtype=np.float64))
else:
r = np.nan

@@ -346,14 +348,25 @@ def apply_series_generator(self) -> Tuple[ResType, "Index"]:
def wrap_results(
self, results: ResType, res_index: "Index"
) -> Union["Series", "DataFrame"]:
from pandas import Series

# see if we can infer the results
if len(results) > 0 and 0 in results and is_sequence(results[0]):

return self.wrap_results_for_axis(results, res_index)

# dict of scalars
result = self.obj._constructor_sliced(results)

# the default dtype of an empty Series will be `object`, but this
# code can be hit by df.mean() where the result should have dtype
# float64 even if it's an empty Series.
constructor_sliced = self.obj._constructor_sliced
if constructor_sliced is Series:
result = create_series_with_explicit_dtype(
results, dtype_if_empty=np.float64
)
else:
result = constructor_sliced(results)
result.index = res_index

return result
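
The float64 fallback matters for reductions over empty frames; a hedged sketch of the intended behaviour (variable names are illustrative, exact output may vary slightly by version):

import numpy as np
import pandas as pd

df = pd.DataFrame(columns=["a", "b"])  # zero rows
reduced = df.apply(np.mean)            # reduction path hits apply_empty_result
reduced.dtype                          # float64, not object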
10 changes: 8 additions & 2 deletions pandas/core/base.py
@@ -34,6 +34,7 @@
from pandas.core.accessor import DirNamesMixin
from pandas.core.algorithms import duplicated, unique1d, value_counts
from pandas.core.arrays import ExtensionArray
from pandas.core.construction import create_series_with_explicit_dtype
import pandas.core.nanops as nanops

_shared_docs: Dict[str, str] = dict()
@@ -1132,9 +1133,14 @@ def _map_values(self, mapper, na_action=None):
# convert to a Series for efficiency.
# we specify the keys here to handle the
# possibility that they are tuples
from pandas import Series

mapper = Series(mapper)
# The return value of mapping with an empty mapper is
# expected to be pd.Series(np.nan, ...). As np.nan is
# of dtype float64 the return value of this method should
# be float64 as well
mapper = create_series_with_explicit_dtype(
mapper, dtype_if_empty=np.float64
)

if isinstance(mapper, ABCSeries):
# Since values were input this means we came from either
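
A short sketch of the case the comment above describes, mapping through an empty mapper:

import pandas as pd

s = pd.Series(["a", "b", "c"])
s.map({})   # every value is missing
# 0   NaN
# 1   NaN
# 2   NaN
# dtype: float64  <- float64 (matching np.nan), not object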
66 changes: 65 additions & 1 deletion pandas/core/construction.py
@@ -4,7 +4,7 @@
These should not depend on core.internals.
"""
from typing import Optional, Sequence, Union, cast
from typing import TYPE_CHECKING, Any, Optional, Sequence, Union, cast

import numpy as np
import numpy.ma as ma
@@ -44,8 +44,13 @@
)
from pandas.core.dtypes.missing import isna

from pandas._typing import ArrayLike, Dtype
import pandas.core.common as com

if TYPE_CHECKING:
from pandas.core.series import Series # noqa: F401
from pandas.core.index import Index # noqa: F401


def array(
data: Sequence[object],
@@ -565,3 +570,62 @@ def _try_cast(
else:
subarr = np.array(arr, dtype=object, copy=copy)
return subarr


def is_empty_data(data: Any) -> bool:
"""
Utility to check if a Series is instantiated with empty data,
which does not contain dtype information.
Parameters
----------
data : array-like, Iterable, dict, or scalar value
Contains data stored in Series.
Returns
-------
bool
"""
is_none = data is None
is_list_like_without_dtype = is_list_like(data) and not hasattr(data, "dtype")
is_simple_empty = is_list_like_without_dtype and not data
return is_none or is_simple_empty


def create_series_with_explicit_dtype(
data: Any = None,
index: Optional[Union[ArrayLike, "Index"]] = None,
dtype: Optional[Dtype] = None,
name: Optional[str] = None,
copy: bool = False,
fastpath: bool = False,
dtype_if_empty: Dtype = object,
) -> "Series":
"""
Helper to pass an explicit dtype when instantiating an empty Series.
This silences a DeprecationWarning described in GitHub-17261.
Parameters
----------
data : Mirrored from Series.__init__
index : Mirrored from Series.__init__
dtype : Mirrored from Series.__init__
name : Mirrored from Series.__init__
copy : Mirrored from Series.__init__
fastpath : Mirrored from Series.__init__
dtype_if_empty : str, numpy.dtype, or ExtensionDtype
This dtype will be passed explicitly if an empty Series will
be instantiated.
Returns
-------
Series
"""
from pandas.core.series import Series

if is_empty_data(data) and dtype is None:
dtype = dtype_if_empty
return Series(
data=data, index=index, dtype=dtype, name=name, copy=copy, fastpath=fastpath
)
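
Rough usage of the two internal helpers added here (not public API; shown only to illustrate the intent):

import numpy as np
from pandas.core.construction import (
    create_series_with_explicit_dtype,
    is_empty_data,
)

is_empty_data([])            # True  -- no dtype information to infer from
is_empty_data(np.array([]))  # False -- the ndarray carries a dtype
s = create_series_with_explicit_dtype(None, dtype_if_empty=object)
s.dtype                      # dtype('O'); no DeprecationWarning is raised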
2 changes: 1 addition & 1 deletion pandas/core/frame.py
@@ -7956,7 +7956,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"):
cols = Index([], name=self.columns.name)
if is_list_like(q):
return self._constructor([], index=q, columns=cols)
return self._constructor_sliced([], index=cols, name=q)
return self._constructor_sliced([], index=cols, name=q, dtype=np.float64)

result = data._data.quantile(
qs=q, axis=1, interpolation=interpolation, transposed=is_transposed
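
A sketch of the branch touched above, :meth:`DataFrame.quantile` on a frame without numeric columns:

import pandas as pd

df = pd.DataFrame({"x": ["a", "b"]})  # no numeric columns
q = df.quantile(0.5)                  # empty result, now explicitly float64
q.dtype                               # float64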
9 changes: 5 additions & 4 deletions pandas/core/generic.py
@@ -72,6 +72,7 @@
import pandas.core.algorithms as algos
from pandas.core.base import PandasObject, SelectionMixin
import pandas.core.common as com
from pandas.core.construction import create_series_with_explicit_dtype
from pandas.core.index import (
Index,
InvalidIndexError,
@@ -6042,9 +6043,9 @@ def fillna(

if self.ndim == 1:
if isinstance(value, (dict, ABCSeries)):
from pandas import Series

value = Series(value)
value = create_series_with_explicit_dtype(
value, dtype_if_empty=object
)
elif not is_list_like(value):
pass
else:
@@ -6996,7 +6997,7 @@ def asof(self, where, subset=None):
if not is_series:
from pandas import Series

return Series(index=self.columns, name=where)
return Series(index=self.columns, name=where, dtype=np.float64)
return np.nan

# It's always much faster to use a *while* loop here for
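
For the :meth:`~DataFrame.asof` branch above, a small sketch of the all-NaN row it returns when ``where`` precedes the first index:

import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0]}, index=[10, 20])
df.asof(5)   # `where` is before the first row -> all-NaN Series
# a   NaN
# Name: 5, dtype: float64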
25 changes: 18 additions & 7 deletions pandas/core/groupby/generic.py
@@ -51,6 +51,7 @@
import pandas.core.algorithms as algorithms
from pandas.core.base import DataError, SpecificationError
import pandas.core.common as com
from pandas.core.construction import create_series_with_explicit_dtype
from pandas.core.frame import DataFrame
from pandas.core.generic import ABCDataFrame, ABCSeries, NDFrame, _shared_docs
from pandas.core.groupby import base
@@ -259,7 +260,9 @@ def aggregate(self, func=None, *args, **kwargs):
result = self._aggregate_named(func, *args, **kwargs)

index = Index(sorted(result), name=self.grouper.names[0])
ret = Series(result, index=index)
ret = create_series_with_explicit_dtype(
result, index=index, dtype_if_empty=object
)

if not self.as_index: # pragma: no cover
print("Warning, ignoring as_index=True")
@@ -407,7 +410,7 @@ def _wrap_transformed_output(
def _wrap_applied_output(self, keys, values, not_indexed_same=False):
if len(keys) == 0:
# GH #6265
return Series([], name=self._selection_name, index=keys)
return Series([], name=self._selection_name, index=keys, dtype=np.float64)

def _get_index() -> Index:
if self.grouper.nkeys > 1:
@@ -493,7 +496,7 @@ def _transform_general(self, func, *args, **kwargs):

result = concat(results).sort_index()
else:
result = Series()
result = Series(dtype=np.float64)

# we will only try to coerce the result type if
# we have a numeric dtype, as these are *always* user-defined funcs
@@ -1205,10 +1208,18 @@ def first_not_none(values):
if v is None:
return DataFrame()
elif isinstance(v, NDFrame):
values = [
x if x is not None else v._constructor(**v._construct_axes_dict())
for x in values
]

# this is to silence a DeprecationWarning
# TODO: Remove when default dtype of empty Series is object
kwargs = v._construct_axes_dict()
if v._constructor is Series:
backup = create_series_with_explicit_dtype(
**kwargs, dtype_if_empty=object
)
else:
backup = v._constructor(**kwargs)

values = [x if (x is not None) else backup for x in values]

v = values[0]

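
And a hedged sketch of the empty-group branch in ``_wrap_applied_output`` (the lambda and variable names are illustrative):

import pandas as pd

s = pd.Series([], dtype="float64")
out = s.groupby(s).apply(lambda x: x.sum())  # no groups at all
out.dtype                                    # float64, per the change above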