pandas-dev · jreback · Jan 4, 2019 · Dec 2, 2018 · Dec 5, 2018 · Dec 6, 2018
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -622,7 +622,7 @@ def clean_index_list(obj: list):
         return obj, all_arrays
 
     # don't force numpy coerce with nan's
-    inferred = infer_dtype(obj)
+    inferred = infer_dtype(obj, skipna=False)
     if inferred in ['string', 'bytes', 'unicode', 'mixed', 'mixed-integer']:
         return np.asarray(obj, dtype=object), 0
     elif inferred in ['integer']:
@@ -1078,19 +1078,20 @@ cdef _try_infer_map(v):
     return None
 
 
-def infer_dtype(value: object, skipna: bool=False) -> str:
+def infer_dtype(value: object, skipna: bool=True) -> str:
     """
     Efficiently infer the type of a passed val, or list-like
     array of values. Return a string describing the type.
 
     Parameters
     ----------
     value : scalar, list, ndarray, or pandas type
-    skipna : bool, default False
-        Ignore NaN values when inferring the type. The default of ``False``
-        will be deprecated in a later version of pandas.
+    skipna : bool, default True
+        Ignore NaN values when inferring the type.
 
         .. versionadded:: 0.21.0
+        .. versionchanged:: 0.24.0
+            Switched default of ``skipna`` to ``True``
 
     Returns
     -------
@@ -1209,6 +1210,10 @@ def infer_dtype(value: object, skipna: bool=False) -> str:
         values = construct_1d_object_array_from_listlike(value)
 
     values = getattr(values, 'values', values)
+
+    # make contiguous
+    values = values.ravel()
+
     if skipna:
         values = values[~isnaobj(values)]
 
@@ -1219,9 +1224,6 @@ def infer_dtype(value: object, skipna: bool=False) -> str:
     if values.dtype != np.object_:
         values = values.astype('O')
 
-    # make contiguous
-    values = values.ravel()
-
     n = len(values)
     if n == 0:
         return 'empty'

diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -545,7 +545,7 @@ def any_numpy_dtype(request):
 
 
 # categoricals are handled separately
-_any_skipna_inferred_dtype = [
+_any_inferred_dtype = [
     ('string', ['a', np.nan, 'c']),
     ('unicode' if not PY3 else 'string', [u('a'), np.nan, u('c')]),
     ('bytes' if PY3 else 'string', [b'a', np.nan, b'c']),
@@ -570,11 +570,11 @@ def any_numpy_dtype(request):
     ('time', [time(1), np.nan, time(2)]),
     ('period', [pd.Period(2013), pd.NaT, pd.Period(2018)]),
     ('interval', [pd.Interval(0, 1), np.nan, pd.Interval(0, 2)])]
-ids, _ = zip(*_any_skipna_inferred_dtype)  # use inferred type as fixture-id
+ids, _ = zip(*_any_inferred_dtype)  # use inferred type as fixture-id
 
 
-@pytest.fixture(params=_any_skipna_inferred_dtype, ids=ids)
-def any_skipna_inferred_dtype(request):
+@pytest.fixture(params=_any_inferred_dtype, ids=ids)
+def any_inferred_dtype(request):
     """
     Fixture for all inferred dtypes from _libs.lib.infer_dtype
 
@@ -610,10 +610,10 @@ def any_skipna_inferred_dtype(request):
     --------
     >>> import pandas._libs.lib as lib
     >>>
-    >>> def test_something(any_skipna_inferred_dtype):
-    ...     inferred_dtype, values = any_skipna_inferred_dtype
+    >>> def test_something(any_inferred_dtype):
+    ...     inferred_dtype, values = any_inferred_dtype
     ...     # will pass
-    ...     assert lib.infer_dtype(values, skipna=True) == inferred_dtype
+    ...     assert lib.infer_dtype(values) == inferred_dtype
     """
     inferred_dtype, values = request.param
     values = np.array(values, dtype=object)  # object dtype to avoid casting

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -203,7 +203,7 @@ def _get_hashtable_algo(values):
     if ndtype == 'object':
 
         # its cheaper to use a String Hash Table than Object
-        if lib.infer_dtype(values) in ['string']:
+        if lib.infer_dtype(values, skipna=False) in ['string']:
             ndtype = 'string'
         else:
             ndtype = 'object'
@@ -221,7 +221,7 @@ def _get_data_algo(values, func_map):
     if ndtype == 'object':
 
         # its cheaper to use a String Hash Table than Object
-        if lib.infer_dtype(values) in ['string']:
+        if lib.infer_dtype(values, skipna=False) in ['string']:
             ndtype = 'string'
 
     f = func_map.get(ndtype, func_map['object'])

diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py
@@ -169,7 +169,7 @@ def coerce_to_array(values, dtype, mask=None, copy=False):
     values = np.array(values, copy=copy)
     if is_object_dtype(values):
         inferred_type = lib.infer_dtype(values)
-        if inferred_type is 'mixed' and isna(values).all():
+        if inferred_type == 'empty':
             values = np.empty(len(values))
             values.fill(np.nan)
         elif inferred_type not in ['floating', 'integer',

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -940,7 +940,7 @@ def try_timedelta(v):
             # e.g. '00:00:01' is a timedelta but
             # technically is also a datetime
             value = try_timedelta(v)
-            if lib.infer_dtype(value) in ['mixed']:
+            if lib.infer_dtype(value, skipna=False) in ['mixed']:
                 value = try_datetime(v)
 
     return value

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -409,7 +409,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
                 subarr = subarr.copy()
 
             if dtype is None:
-                inferred = lib.infer_dtype(subarr)
+                inferred = lib.infer_dtype(subarr, skipna=False)
                 if inferred == 'integer':
                     try:
                         return cls._try_convert_to_int_index(

diff --git a/pandas/io/sql.py b/pandas/io/sql.py
@@ -819,27 +819,15 @@ def _harmonize_columns(self, parse_dates=None):
             except KeyError:
                 pass  # this column not in results
 
-    def _get_notna_col_dtype(self, col):
-        """
-        Infer datatype of the Series col.  In case the dtype of col is 'object'
-        and it contains NA values, this infers the datatype of the not-NA
-        values.  Needed for inserting typed data containing NULLs, GH8778.
-        """
-        col_for_inference = col
-        if col.dtype == 'object':
-            notnadata = col[~isna(col)]
-            if len(notnadata):
-                col_for_inference = notnadata
-
-        return lib.infer_dtype(col_for_inference)
-
     def _sqlalchemy_type(self, col):
 
         dtype = self.dtype or {}
         if col.name in dtype:
             return self.dtype[col.name]
 
-        col_type = self._get_notna_col_dtype(col)
+        # Infer type of column, while ignoring missing values.
+        # Needed for inserting typed data containing NULLs, GH 8778.
+        col_type = lib.infer_dtype(col)
 
         from sqlalchemy.types import (BigInteger, Integer, Float,
                                       Text, Boolean,
@@ -1325,7 +1313,10 @@ def _sql_type_name(self, col):
         if col.name in dtype:
             return dtype[col.name]
 
-        col_type = self._get_notna_col_dtype(col)
+        # Infer type of column, while ignoring missing values.
+        # Needed for inserting typed data containing NULLs, GH 8778.
+        col_type = lib.infer_dtype(col)
+
         if col_type == 'timedelta64':
             warnings.warn("the 'timedelta' type is not supported, and will be "
                           "written as integer values (ns frequency) to the "

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -1865,7 +1865,7 @@ def _dtype_to_default_stata_fmt(dtype, column, dta_version=114,
         if force_strl:
             return '%9s'
     if dtype.type == np.object_:
-        inferred_dtype = infer_dtype(column.dropna())
+        inferred_dtype = infer_dtype(column)
         if not (inferred_dtype in ('string', 'unicode') or
                 len(column) == 0):
             raise ValueError('Column `{col}` cannot be exported.\n\nOnly '

diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
@@ -496,12 +496,12 @@ class TestTypeInference(object):
     class Dummy():
         pass
 
-    def test_inferred_dtype_fixture(self, any_skipna_inferred_dtype):
+    def test_inferred_dtype_fixture(self, any_inferred_dtype):
         # see pandas/conftest.py
-        inferred_dtype, values = any_skipna_inferred_dtype
+        inferred_dtype, values = any_inferred_dtype
 
         # make sure the inferred dtype of the fixture is as requested
-        assert inferred_dtype == lib.infer_dtype(values, skipna=True)
+        assert inferred_dtype == lib.infer_dtype(values)
 
     def test_length_zero(self):
         result = lib.infer_dtype(np.array([], dtype='i4'))
@@ -547,9 +547,12 @@ def test_bools(self):
         assert result == 'boolean'
 
         arr = np.array([True, np.nan, False], dtype='O')
-        result = lib.infer_dtype(arr, skipna=True)
+        result = lib.infer_dtype(arr)
         assert result == 'boolean'
 
+        result = lib.infer_dtype(arr, skipna=False)
+        assert result == 'mixed'
+
     def test_floats(self):
         arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O')
         result = lib.infer_dtype(arr)
@@ -591,11 +594,11 @@ def test_string(self):
 
     def test_unicode(self):
         arr = [u'a', np.nan, u'c']
-        result = lib.infer_dtype(arr)
+        result = lib.infer_dtype(arr, skipna=False)
         assert result == 'mixed'
 
         arr = [u'a', np.nan, u'c']
-        result = lib.infer_dtype(arr, skipna=True)
+        result = lib.infer_dtype(arr)
         expected = 'unicode' if PY2 else 'string'
         assert result == expected
 
@@ -657,11 +660,11 @@ def test_infer_dtype_datetime(self):
         # different type of nat
         arr = np.array([np.timedelta64('nat'),
                         np.datetime64('2011-01-02')], dtype=object)
-        assert lib.infer_dtype(arr) == 'mixed'
+        assert lib.infer_dtype(arr, skipna=False) == 'mixed'
 
         arr = np.array([np.datetime64('2011-01-02'),
                         np.timedelta64('nat')], dtype=object)
-        assert lib.infer_dtype(arr) == 'mixed'
+        assert lib.infer_dtype(arr, skipna=False) == 'mixed'
 
         # mixed datetime
         arr = np.array([datetime(2011, 1, 1),
@@ -722,11 +725,11 @@ def test_infer_dtype_timedelta(self):
         # different type of nat
         arr = np.array([np.datetime64('nat'), np.timedelta64(1, 'D')],
                        dtype=object)
-        assert lib.infer_dtype(arr) == 'mixed'
+        assert lib.infer_dtype(arr, skipna=False) == 'mixed'
 
         arr = np.array([np.timedelta64(1, 'D'), np.datetime64('nat')],
                        dtype=object)
-        assert lib.infer_dtype(arr) == 'mixed'
+        assert lib.infer_dtype(arr, skipna=False) == 'mixed'
 
     def test_infer_dtype_period(self):
         # GH 13664
@@ -749,11 +752,11 @@ def test_infer_dtype_period(self):
         # different type of nat
         arr = np.array([np.datetime64('nat'), pd.Period('2011-01', freq='M')],
                        dtype=object)
-        assert lib.infer_dtype(arr) == 'mixed'
+        assert lib.infer_dtype(arr, skipna=False) == 'mixed'
 
         arr = np.array([pd.Period('2011-01', freq='M'), np.datetime64('nat')],
                        dtype=object)
-        assert lib.infer_dtype(arr) == 'mixed'
+        assert lib.infer_dtype(arr, skipna=False) == 'mixed'
 
     @pytest.mark.parametrize(
         "data",
@@ -827,56 +830,58 @@ def test_infer_dtype_all_nan_nat_like(self):
 
         # nan and None mix are result in mixed
         arr = np.array([np.nan, np.nan, None])
-        assert lib.infer_dtype(arr) == 'mixed'
+        assert lib.infer_dtype(arr) == 'empty'
+        assert lib.infer_dtype(arr, skipna=False) == 'mixed'
 
         arr = np.array([None, np.nan, np.nan])
-        assert lib.infer_dtype(arr) == 'mixed'
+        assert lib.infer_dtype(arr) == 'empty'
+        assert lib.infer_dtype(arr, skipna=False) == 'mixed'
 
         # pd.NaT
         arr = np.array([pd.NaT])
-        assert lib.infer_dtype(arr) == 'datetime'
+        assert lib.infer_dtype(arr, skipna=False) == 'datetime'
 
         arr = np.array([pd.NaT, np.nan])
-        assert lib.infer_dtype(arr) == 'datetime'
+        assert lib.infer_dtype(arr, skipna=False) == 'datetime'
 
         arr = np.array([np.nan, pd.NaT])
-        assert lib.infer_dtype(arr) == 'datetime'
+        assert lib.infer_dtype(arr, skipna=False) == 'datetime'
 
         arr = np.array([np.nan, pd.NaT, np.nan])
-        assert lib.infer_dtype(arr) == 'datetime'
+        assert lib.infer_dtype(arr, skipna=False) == 'datetime'
 
         arr = np.array([None, pd.NaT, None])
-        assert lib.infer_dtype(arr) == 'datetime'
+        assert lib.infer_dtype(arr, skipna=False) == 'datetime'
 
         # np.datetime64(nat)
         arr = np.array([np.datetime64('nat')])
-        assert lib.infer_dtype(arr) == 'datetime64'
+        assert lib.infer_dtype(arr, skipna=False) == 'datetime64'
 
         for n in [np.nan, pd.NaT, None]:
             arr = np.array([n, np.datetime64('nat'), n])
-            assert lib.infer_dtype(arr) == 'datetime64'
+            assert lib.infer_dtype(arr, skipna=False) == 'datetime64'
 
             arr = np.array([pd.NaT, n, np.datetime64('nat'), n])
-            assert lib.infer_dtype(arr) == 'datetime64'
+            assert lib.infer_dtype(arr, skipna=False) == 'datetime64'
 
         arr = np.array([np.timedelta64('nat')], dtype=object)
-        assert lib.infer_dtype(arr) == 'timedelta'
+        assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
 
         for n in [np.nan, pd.NaT, None]:
             arr = np.array([n, np.timedelta64('nat'), n])
-            assert lib.infer_dtype(arr) == 'timedelta'
+            assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
 
             arr = np.array([pd.NaT, n, np.timedelta64('nat'), n])
-            assert lib.infer_dtype(arr) == 'timedelta'
+            assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
 
         # datetime / timedelta mixed
         arr = np.array([pd.NaT, np.datetime64('nat'),
                         np.timedelta64('nat'), np.nan])
-        assert lib.infer_dtype(arr) == 'mixed'
+        assert lib.infer_dtype(arr, skipna=False) == 'mixed'
 
         arr = np.array([np.timedelta64('nat'), np.datetime64('nat')],
                        dtype=object)
-        assert lib.infer_dtype(arr) == 'mixed'
+        assert lib.infer_dtype(arr, skipna=False) == 'mixed'
 
     def test_is_datetimelike_array_all_nan_nat_like(self):
         arr = np.array([np.nan, pd.NaT, np.datetime64('nat')])
@@ -940,10 +945,10 @@ def test_date(self):
         assert index.inferred_type == 'date'
 
         dates = [date(2012, 1, day) for day in range(1, 20)] + [np.nan]
-        result = lib.infer_dtype(dates)
+        result = lib.infer_dtype(dates, skipna=False)
         assert result == 'mixed'
 
-        result = lib.infer_dtype(dates, skipna=True)
+        result = lib.infer_dtype(dates)
         assert result == 'date'
 
     def test_is_numeric_array(self):
@@ -984,8 +989,10 @@ def test_object(self):
         # GH 7431
         # cannot infer more than this as only a single element
         arr = np.array([None], dtype='O')
-        result = lib.infer_dtype(arr)
+        result = lib.infer_dtype(arr, skipna=False)
         assert result == 'mixed'
+        result = lib.infer_dtype(arr)
+        assert result == 'empty'
 
     def test_to_object_array_width(self):
         # see gh-13320