diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 3a04789b609f8..d5250bc688826 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1220,6 +1220,7 @@ Deprecations - :func:`pandas.api.types.is_datetimetz` is deprecated in favor of `pandas.api.types.is_datetime64tz` (:issue:`23917`) - Creating a :class:`TimedeltaIndex`, :class:`DatetimeIndex`, or :class:`PeriodIndex` by passing range arguments `start`, `end`, and `periods` is deprecated in favor of :func:`timedelta_range`, :func:`date_range`, or :func:`period_range` (:issue:`23919`) - Passing a string alias like ``'datetime64[ns, UTC]'`` as the ``unit`` parameter to :class:`DatetimeTZDtype` is deprecated. Use :class:`DatetimeTZDtype.construct_from_string` instead (:issue:`23990`). +- The default value of the ``skipna`` parameter of :func:`~pandas.api.types.infer_dtype` will switch from ``False`` to ``True`` in a future version of pandas (:issue:`17066`, :issue:`24050`) - In :meth:`Series.where` with Categorical data, providing an ``other`` that is not present in the categories is deprecated. Convert the categorical to a different dtype or add the ``other`` to the categories first (:issue:`24077`). - :meth:`Series.clip_lower`, :meth:`Series.clip_upper`, :meth:`DataFrame.clip_lower` and :meth:`DataFrame.clip_upper` are deprecated and will be removed in a future version. 
Use ``Series.clip(lower=threshold)``, ``Series.clip(upper=threshold)`` and the equivalent ``DataFrame`` methods (:issue:`24203`) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 1124000c97875..6e6d35f00725c 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -4,6 +4,7 @@ from fractions import Fraction from numbers import Number import sys +import warnings import cython from cython import Py_ssize_t @@ -1079,7 +1080,7 @@ cdef _try_infer_map(v): return None -def infer_dtype(value: object, skipna: bool=False) -> str: +def infer_dtype(value: object, skipna: object=None) -> str: """ Efficiently infer the type of a passed val, or list-like array of values. Return a string describing the type. @@ -1088,8 +1089,7 @@ def infer_dtype(value: object, skipna: bool=False) -> str: ---------- value : scalar, list, ndarray, or pandas type skipna : bool, default False - Ignore NaN values when inferring the type. The default of ``False`` - will be deprecated in a later version of pandas. + Ignore NaN values when inferring the type. .. versionadded:: 0.21.0 @@ -1186,6 +1186,12 @@ def infer_dtype(value: object, skipna: bool=False) -> str: bint seen_pdnat = False bint seen_val = False + if skipna is None: + msg = ('A future version of pandas will default to `skipna=True`. 
To ' + 'silence this warning, pass `skipna=True|False` explicitly.') + warnings.warn(msg, FutureWarning, stacklevel=2) + skipna = False + if util.is_array(value): values = value elif hasattr(value, 'dtype'): diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py index 04842d82fca5d..9b2240eb62906 100644 --- a/pandas/core/arrays/array_.py +++ b/pandas/core/arrays/array_.py @@ -209,7 +209,7 @@ def array(data, # type: Sequence[object] return cls._from_sequence(data, dtype=dtype, copy=copy) if dtype is None: - inferred_dtype = lib.infer_dtype(data) + inferred_dtype = lib.infer_dtype(data, skipna=False) if inferred_dtype == 'period': try: return period_array(data, copy=copy) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 191cd5d63eea3..7861a122afdb6 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -962,8 +962,8 @@ def _maybe_coerce_merge_keys(self): # object values are allowed to be merged elif ((lk_is_object and is_numeric_dtype(rk)) or (is_numeric_dtype(lk) and rk_is_object)): - inferred_left = lib.infer_dtype(lk) - inferred_right = lib.infer_dtype(rk) + inferred_left = lib.infer_dtype(lk, skipna=False) + inferred_right = lib.infer_dtype(rk, skipna=False) bool_types = ['integer', 'mixed-integer', 'boolean', 'empty'] string_types = ['string', 'unicode', 'mixed', 'bytes', 'empty'] diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index cc2aa64b98c8b..f58cb362cd6d2 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -334,11 +334,11 @@ def test_infer_dtype_bytes(self): # string array of bytes arr = np.array(list('abc'), dtype='S1') - assert lib.infer_dtype(arr, skipna=False) == compare + assert lib.infer_dtype(arr, skipna=True) == compare # object array of bytes arr = arr.astype(object) - assert lib.infer_dtype(arr, skipna=False) == compare + assert lib.infer_dtype(arr, skipna=True) == compare # object 
array of bytes with missing values assert lib.infer_dtype([b'a', np.nan, b'c'], skipna=True) == compare @@ -538,32 +538,40 @@ def test_length_zero(self, skipna): def test_integers(self): arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O') - result = lib.infer_dtype(arr, skipna=False) + result = lib.infer_dtype(arr, skipna=True) assert result == 'integer' arr = np.array([1, 2, 3, np.int64(4), np.int32(5), 'foo'], dtype='O') - result = lib.infer_dtype(arr, skipna=False) + result = lib.infer_dtype(arr, skipna=True) assert result == 'mixed-integer' arr = np.array([1, 2, 3, 4, 5], dtype='i4') - result = lib.infer_dtype(arr, skipna=False) + result = lib.infer_dtype(arr, skipna=True) assert result == 'integer' + def test_deprecation(self): + # GH 24050 + arr = np.array([1, 2, 3], dtype=object) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = lib.infer_dtype(arr) # default: skipna=None -> warn + assert result == 'integer' + def test_bools(self): arr = np.array([True, False, True, True, True], dtype='O') - result = lib.infer_dtype(arr, skipna=False) + result = lib.infer_dtype(arr, skipna=True) assert result == 'boolean' arr = np.array([np.bool_(True), np.bool_(False)], dtype='O') - result = lib.infer_dtype(arr, skipna=False) + result = lib.infer_dtype(arr, skipna=True) assert result == 'boolean' arr = np.array([True, False, True, 'foo'], dtype='O') - result = lib.infer_dtype(arr, skipna=False) + result = lib.infer_dtype(arr, skipna=True) assert result == 'mixed' arr = np.array([True, False, True], dtype=bool) - result = lib.infer_dtype(arr, skipna=False) + result = lib.infer_dtype(arr, skipna=True) assert result == 'boolean' arr = np.array([True, np.nan, False], dtype='O') @@ -575,38 +583,38 @@ def test_bools(self): def test_floats(self): arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O') - result = lib.infer_dtype(arr, skipna=False) + result = lib.infer_dtype(arr, skipna=True) assert result == 'floating' 
arr = np.array([1, 2, 3, np.float64(4), np.float32(5), 'foo'], dtype='O') - result = lib.infer_dtype(arr, skipna=False) + result = lib.infer_dtype(arr, skipna=True) assert result == 'mixed-integer' arr = np.array([1, 2, 3, 4, 5], dtype='f4') - result = lib.infer_dtype(arr, skipna=False) + result = lib.infer_dtype(arr, skipna=True) assert result == 'floating' arr = np.array([1, 2, 3, 4, 5], dtype='f8') - result = lib.infer_dtype(arr, skipna=False) + result = lib.infer_dtype(arr, skipna=True) assert result == 'floating' def test_decimals(self): # GH15690 arr = np.array([Decimal(1), Decimal(2), Decimal(3)]) - result = lib.infer_dtype(arr, skipna=False) + result = lib.infer_dtype(arr, skipna=True) assert result == 'decimal' arr = np.array([1.0, 2.0, Decimal(3)]) - result = lib.infer_dtype(arr, skipna=False) + result = lib.infer_dtype(arr, skipna=True) assert result == 'mixed' arr = np.array([Decimal(1), Decimal('NaN'), Decimal(3)]) - result = lib.infer_dtype(arr, skipna=False) + result = lib.infer_dtype(arr, skipna=True) assert result == 'decimal' arr = np.array([Decimal(1), np.nan, Decimal(3)], dtype='O') - result = lib.infer_dtype(arr, skipna=False) + result = lib.infer_dtype(arr, skipna=True) assert result == 'decimal' def test_string(self): @@ -648,34 +656,34 @@ def test_infer_dtype_datetime(self): arr = np.array([Timestamp('2011-01-01'), Timestamp('2011-01-02')]) - assert lib.infer_dtype(arr, skipna=False) == 'datetime' + assert lib.infer_dtype(arr, skipna=True) == 'datetime' arr = np.array([np.datetime64('2011-01-01'), np.datetime64('2011-01-01')], dtype=object) - assert lib.infer_dtype(arr, skipna=False) == 'datetime64' + assert lib.infer_dtype(arr, skipna=True) == 'datetime64' arr = np.array([datetime(2011, 1, 1), datetime(2012, 2, 1)]) - assert lib.infer_dtype(arr, skipna=False) == 'datetime' + assert lib.infer_dtype(arr, skipna=True) == 'datetime' # starts with nan for n in [pd.NaT, np.nan]: arr = np.array([n, pd.Timestamp('2011-01-02')]) - assert 
lib.infer_dtype(arr, skipna=False) == 'datetime' + assert lib.infer_dtype(arr, skipna=True) == 'datetime' arr = np.array([n, np.datetime64('2011-01-02')]) - assert lib.infer_dtype(arr, skipna=False) == 'datetime64' + assert lib.infer_dtype(arr, skipna=True) == 'datetime64' arr = np.array([n, datetime(2011, 1, 1)]) - assert lib.infer_dtype(arr, skipna=False) == 'datetime' + assert lib.infer_dtype(arr, skipna=True) == 'datetime' arr = np.array([n, pd.Timestamp('2011-01-02'), n]) - assert lib.infer_dtype(arr, skipna=False) == 'datetime' + assert lib.infer_dtype(arr, skipna=True) == 'datetime' arr = np.array([n, np.datetime64('2011-01-02'), n]) - assert lib.infer_dtype(arr, skipna=False) == 'datetime64' + assert lib.infer_dtype(arr, skipna=True) == 'datetime64' arr = np.array([n, datetime(2011, 1, 1), n]) - assert lib.infer_dtype(arr, skipna=False) == 'datetime' + assert lib.infer_dtype(arr, skipna=True) == 'datetime' # different type of nat arr = np.array([np.timedelta64('nat'), @@ -689,58 +697,58 @@ def test_infer_dtype_datetime(self): # mixed datetime arr = np.array([datetime(2011, 1, 1), pd.Timestamp('2011-01-02')]) - assert lib.infer_dtype(arr, skipna=False) == 'datetime' + assert lib.infer_dtype(arr, skipna=True) == 'datetime' # should be datetime? 
arr = np.array([np.datetime64('2011-01-01'), pd.Timestamp('2011-01-02')]) - assert lib.infer_dtype(arr, skipna=False) == 'mixed' + assert lib.infer_dtype(arr, skipna=True) == 'mixed' arr = np.array([pd.Timestamp('2011-01-02'), np.datetime64('2011-01-01')]) - assert lib.infer_dtype(arr, skipna=False) == 'mixed' + assert lib.infer_dtype(arr, skipna=True) == 'mixed' arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1]) - assert lib.infer_dtype(arr, skipna=False) == 'mixed-integer' + assert lib.infer_dtype(arr, skipna=True) == 'mixed-integer' arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1.1]) - assert lib.infer_dtype(arr, skipna=False) == 'mixed' + assert lib.infer_dtype(arr, skipna=True) == 'mixed' arr = np.array([np.nan, '2011-01-01', pd.Timestamp('2011-01-02')]) - assert lib.infer_dtype(arr, skipna=False) == 'mixed' + assert lib.infer_dtype(arr, skipna=True) == 'mixed' def test_infer_dtype_timedelta(self): arr = np.array([pd.Timedelta('1 days'), pd.Timedelta('2 days')]) - assert lib.infer_dtype(arr, skipna=False) == 'timedelta' + assert lib.infer_dtype(arr, skipna=True) == 'timedelta' arr = np.array([np.timedelta64(1, 'D'), np.timedelta64(2, 'D')], dtype=object) - assert lib.infer_dtype(arr, skipna=False) == 'timedelta' + assert lib.infer_dtype(arr, skipna=True) == 'timedelta' arr = np.array([timedelta(1), timedelta(2)]) - assert lib.infer_dtype(arr, skipna=False) == 'timedelta' + assert lib.infer_dtype(arr, skipna=True) == 'timedelta' # starts with nan for n in [pd.NaT, np.nan]: arr = np.array([n, Timedelta('1 days')]) - assert lib.infer_dtype(arr, skipna=False) == 'timedelta' + assert lib.infer_dtype(arr, skipna=True) == 'timedelta' arr = np.array([n, np.timedelta64(1, 'D')]) - assert lib.infer_dtype(arr, skipna=False) == 'timedelta' + assert lib.infer_dtype(arr, skipna=True) == 'timedelta' arr = np.array([n, timedelta(1)]) - assert lib.infer_dtype(arr, skipna=False) == 'timedelta' + assert lib.infer_dtype(arr, skipna=True) == 'timedelta' arr = 
np.array([n, pd.Timedelta('1 days'), n]) - assert lib.infer_dtype(arr, skipna=False) == 'timedelta' + assert lib.infer_dtype(arr, skipna=True) == 'timedelta' arr = np.array([n, np.timedelta64(1, 'D'), n]) - assert lib.infer_dtype(arr, skipna=False) == 'timedelta' + assert lib.infer_dtype(arr, skipna=True) == 'timedelta' arr = np.array([n, timedelta(1), n]) - assert lib.infer_dtype(arr, skipna=False) == 'timedelta' + assert lib.infer_dtype(arr, skipna=True) == 'timedelta' # different type of nat arr = np.array([np.datetime64('nat'), np.timedelta64(1, 'D')], @@ -755,19 +763,19 @@ def test_infer_dtype_period(self): # GH 13664 arr = np.array([pd.Period('2011-01', freq='D'), pd.Period('2011-02', freq='D')]) - assert lib.infer_dtype(arr, skipna=False) == 'period' + assert lib.infer_dtype(arr, skipna=True) == 'period' arr = np.array([pd.Period('2011-01', freq='D'), pd.Period('2011-02', freq='M')]) - assert lib.infer_dtype(arr, skipna=False) == 'period' + assert lib.infer_dtype(arr, skipna=True) == 'period' # starts with nan for n in [pd.NaT, np.nan]: arr = np.array([n, pd.Period('2011-01', freq='D')]) - assert lib.infer_dtype(arr, skipna=False) == 'period' + assert lib.infer_dtype(arr, skipna=True) == 'period' arr = np.array([n, pd.Period('2011-01', freq='D'), n]) - assert lib.infer_dtype(arr, skipna=False) == 'period' + assert lib.infer_dtype(arr, skipna=True) == 'period' # different type of nat arr = np.array([np.datetime64('nat'), pd.Period('2011-01', freq='M')], @@ -846,7 +854,7 @@ def test_infer_datetimelike_array_nan_nat_like(self, first, second, def test_infer_dtype_all_nan_nat_like(self): arr = np.array([np.nan, np.nan]) - assert lib.infer_dtype(arr, skipna=False) == 'floating' + assert lib.infer_dtype(arr, skipna=True) == 'floating' # nan and None mix are result in mixed arr = np.array([np.nan, np.nan, None]) @@ -1043,17 +1051,17 @@ def test_categorical(self): # GH 8974 from pandas import Categorical, Series arr = Categorical(list('abc')) - result = 
lib.infer_dtype(arr, skipna=False) + result = lib.infer_dtype(arr, skipna=True) assert result == 'categorical' - result = lib.infer_dtype(Series(arr), skipna=False) + result = lib.infer_dtype(Series(arr), skipna=True) assert result == 'categorical' arr = Categorical(list('abc'), categories=['cegfab'], ordered=True) - result = lib.infer_dtype(arr, skipna=False) + result = lib.infer_dtype(arr, skipna=True) assert result == 'categorical' - result = lib.infer_dtype(Series(arr), skipna=False) + result = lib.infer_dtype(Series(arr), skipna=True) assert result == 'categorical' diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index f921d015fce3d..fa303c904440c 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -813,12 +813,12 @@ def test_constructor_with_datetime_tz(self): s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')]) assert s.dtype == 'datetime64[ns, US/Pacific]' - assert lib.infer_dtype(s, skipna=False) == 'datetime64' + assert lib.infer_dtype(s, skipna=True) == 'datetime64' s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Eastern')]) assert s.dtype == 'object' - assert lib.infer_dtype(s, skipna=False) == 'datetime' + assert lib.infer_dtype(s, skipna=True) == 'datetime' # with all NaT s = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]')