Merge remote-tracking branch 'upstream/master' into STY-repr-batch-5

pandas-dev · Dec 2, 2019 · 3f5fa5f · 3f5fa5f
2 parents 8047860 + 83812e1
commit 3f5fa5f
Show file tree

Hide file tree

Showing 26 changed files with 320 additions and 212 deletions.
diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst
@@ -25,8 +25,7 @@ numbers.
 
 Pandas can represent integer data with possibly missing values using
 :class:`arrays.IntegerArray`. This is an :ref:`extension type <extending.extension-types>`
-implemented within pandas. It is not the default dtype for integers, and will not be inferred;
-you must explicitly pass the dtype into :meth:`array` or :class:`Series`:
+implemented within pandas.
 
 .. ipython:: python
 
@@ -50,24 +49,43 @@ NumPy array.
 You can also pass the list-like object to the :class:`Series` constructor
 with the dtype.
 
-.. ipython:: python
+.. warning::
 
-   s = pd.Series([1, 2, np.nan], dtype="Int64")
-   s
+   Currently :meth:`pandas.array` and :meth:`pandas.Series` use different
+   rules for dtype inference. :meth:`pandas.array` will infer a
+   nullable-integer dtype:
 
-By default (if you don't specify ``dtype``), NumPy is used, and you'll end
-up with a ``float64`` dtype Series:
+   .. ipython:: python
 
-.. ipython:: python
+      pd.array([1, None])
+      pd.array([1, 2])
+
+   For backwards-compatibility, :class:`Series` infers these as either
+   integer or float dtype:
+
+   .. ipython:: python
+
+      pd.Series([1, None])
+      pd.Series([1, 2])
 
-   pd.Series([1, 2, np.nan])
+   We recommend explicitly providing the dtype to avoid confusion.
+
+   .. ipython:: python
+
+      pd.array([1, None], dtype="Int64")
+      pd.Series([1, None], dtype="Int64")
+
+   In the future, we may provide an option for :class:`Series` to infer a
+   nullable-integer dtype.
 
 Operations involving an integer array will behave similarly to NumPy arrays.
 Missing values will be propagated, and the data will be coerced to another
 dtype if needed.
 
 .. ipython:: python
 
+   s = pd.Series([1, 2, None], dtype="Int64")
+
    # arithmetic
    s + 1
 

diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb
@@ -677,7 +677,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Notice that you're able share the styles even though they're data aware. The styles are re-evaluated on the new DataFrame they've been `use`d upon."
+    "Notice that you're able to share the styles even though they're data aware. The styles are re-evaluated on the new DataFrame they've been `use`d upon."
    ]
   },
   {

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -303,6 +303,58 @@ The following methods now also correctly output values for unobserved categories
 
    df.groupby(["cat_1", "cat_2"], observed=False)["value"].count()
 
+:meth:`pandas.array` inference changes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:meth:`pandas.array` now infers pandas' new extension types in several cases (:issue:`29791`):
+
+1. String data (including missing values) now returns a :class:`arrays.StringArray`.
+2. Integer data (including missing values) now returns a :class:`arrays.IntegerArray`.
+3. Boolean data (including missing values) now returns the new :class:`arrays.BooleanArray`.
+
+*pandas 0.25.x*
+
+.. code-block:: python
+
+   >>> pd.array(["a", None])
+   <PandasArray>
+   ['a', None]
+   Length: 2, dtype: object
+
+   >>> pd.array([1, None])
+   <PandasArray>
+   [1, None]
+   Length: 2, dtype: object
+
+
+*pandas 1.0.0*
+
+.. ipython:: python
+
+   pd.array(["a", None])
+   pd.array([1, None])
+
+As a reminder, you can specify the ``dtype`` to disable all inference.
+
+By default :meth:`Categorical.min` now returns the minimum instead of np.nan
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When :class:`Categorical` contains ``np.nan``,
+:meth:`Categorical.min` no longer returns ``np.nan`` by default (``skipna=True``) (:issue:`25303`).
+
+*pandas 0.25.x*
+
+.. code-block:: ipython
+
+   In [1]: pd.Categorical([1, 2, np.nan], ordered=True).min()
+   Out[1]: nan
+
+
+*pandas 1.0.0*
+
+.. ipython:: python
+
+   pd.Categorical([1, 2, np.nan], ordered=True).min()
 
 .. _whatsnew_1000.api_breaking.deps:
 
@@ -388,7 +440,6 @@ Other API changes
 - :meth:`Series.dropna` has dropped its ``**kwargs`` argument in favor of a single ``how`` parameter.
   Supplying anything other than ``how`` to ``**kwargs`` raised a ``TypeError`` previously (:issue:`29388`)
 - When testing pandas, the new minimum required version of pytest is 5.0.1 (:issue:`29664`)
--
 
 
 .. _whatsnew_1000.api.documentation:
@@ -410,6 +461,8 @@ Deprecations
 - :func:`is_extension_type` is deprecated, :func:`is_extension_array_dtype` should be used instead (:issue:`29457`)
 - :func:`eval` keyword argument "truediv" is deprecated and will be removed in a future version (:issue:`29812`)
 - :meth:`Categorical.take_nd` is deprecated, use :meth:`Categorical.take` instead (:issue:`27745`)
+- The parameter ``numeric_only`` of :meth:`Categorical.min` and :meth:`Categorical.max` is deprecated and replaced with ``skipna`` (:issue:`25303`)
+-
 
 .. _whatsnew_1000.prior_deprecations:
 

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -1313,7 +1313,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
 
     elif isinstance(val, str):
         if is_string_array(values, skipna=skipna):
-            return 'string'
+            return "string"
 
     elif isinstance(val, bytes):
         if is_bytes_array(values, skipna=skipna):

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -451,7 +451,9 @@ def _values_for_argsort(self) -> np.ndarray:
         # Note: this is used in `ExtensionArray.argsort`.
         return np.array(self)
 
-    def argsort(self, ascending=True, kind="quicksort", *args, **kwargs):
+    def argsort(
+        self, ascending: bool = True, kind: str = "quicksort", *args, **kwargs
+    ) -> np.ndarray:
         """
         Return the indices that would sort this array.
 
@@ -467,7 +469,7 @@ def argsort(self, ascending=True, kind="quicksort", *args, **kwargs):
 
         Returns
         -------
-        index_array : ndarray
+        ndarray
             Array of indices that sort ``self``. If NaN values are contained,
             NaN values are placed at the end.
 
@@ -1198,10 +1200,9 @@ def _maybe_convert(arr):
 
             if op.__name__ in {"divmod", "rdivmod"}:
                 a, b = zip(*res)
-                res = _maybe_convert(a), _maybe_convert(b)
-            else:
-                res = _maybe_convert(res)
-            return res
+                return _maybe_convert(a), _maybe_convert(b)
+
+            return _maybe_convert(res)
 
         op_name = ops._get_op_name(op, True)
         return set_function_name(_binop, op_name, cls)

diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -2123,7 +2123,8 @@ def _reduce(self, name, axis=0, **kwargs):
             raise TypeError(f"Categorical cannot perform the operation {name}")
         return func(**kwargs)
 
-    def min(self, numeric_only=None, **kwargs):
+    @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna")
+    def min(self, skipna=True):
         """
         The minimum value of the object.
 
@@ -2139,17 +2140,18 @@ def min(self, numeric_only=None, **kwargs):
         min : the minimum of this `Categorical`
         """
         self.check_for_ordered("min")
-        if numeric_only:
-            good = self._codes != -1
-            pointer = self._codes[good].min(**kwargs)
-        else:
-            pointer = self._codes.min(**kwargs)
-        if pointer == -1:
-            return np.nan
+        good = self._codes != -1
+        if not good.all():
+            if skipna:
+                pointer = self._codes[good].min()
+            else:
+                return np.nan
         else:
-            return self.categories[pointer]
+            pointer = self._codes.min()
+        return self.categories[pointer]
 
-    def max(self, numeric_only=None, **kwargs):
+    @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna")
+    def max(self, skipna=True):
         """
         The maximum value of the object.
 
@@ -2165,15 +2167,15 @@ def max(self, numeric_only=None, **kwargs):
         max : the maximum of this `Categorical`
         """
         self.check_for_ordered("max")
-        if numeric_only:
-            good = self._codes != -1
-            pointer = self._codes[good].max(**kwargs)
-        else:
-            pointer = self._codes.max(**kwargs)
-        if pointer == -1:
-            return np.nan
+        good = self._codes != -1
+        if not good.all():
+            if skipna:
+                pointer = self._codes[good].max()
+            else:
+                return np.nan
         else:
-            return self.categories[pointer]
+            pointer = self._codes.max()
+        return self.categories[pointer]
 
     def mode(self, dropna=True):
         """