Skip to content

Commit

Permalink
DEPR: deprecate default of skipna=False in infer_dtype (pandas-dev#24050)
Browse files Browse the repository at this point in the history
  • Loading branch information
h-vetinari authored and Pingviinituutti committed Feb 28, 2019
1 parent af561c4 commit 0de9973
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 58 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1220,6 +1220,7 @@ Deprecations
- :func:`pandas.api.types.is_datetimetz` is deprecated in favor of `pandas.api.types.is_datetime64tz` (:issue:`23917`)
- Creating a :class:`TimedeltaIndex`, :class:`DatetimeIndex`, or :class:`PeriodIndex` by passing range arguments `start`, `end`, and `periods` is deprecated in favor of :func:`timedelta_range`, :func:`date_range`, or :func:`period_range` (:issue:`23919`)
- Passing a string alias like ``'datetime64[ns, UTC]'`` as the ``unit`` parameter to :class:`DatetimeTZDtype` is deprecated. Use :class:`DatetimeTZDtype.construct_from_string` instead (:issue:`23990`).
- The ``skipna`` parameter of :meth:`~pandas.api.types.infer_dtype` will switch to ``True`` by default in a future version of pandas (:issue:`17066`, :issue:`24050`)
- In :meth:`Series.where` with Categorical data, providing an ``other`` that is not present in the categories is deprecated. Convert the categorical to a different dtype or add the ``other`` to the categories first (:issue:`24077`).
- :meth:`Series.clip_lower`, :meth:`Series.clip_upper`, :meth:`DataFrame.clip_lower` and :meth:`DataFrame.clip_upper` are deprecated and will be removed in a future version. Use ``Series.clip(lower=threshold)``, ``Series.clip(upper=threshold)`` and the equivalent ``DataFrame`` methods (:issue:`24203`)

Expand Down
12 changes: 9 additions & 3 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ from fractions import Fraction
from numbers import Number

import sys
import warnings

import cython
from cython import Py_ssize_t
Expand Down Expand Up @@ -1079,7 +1080,7 @@ cdef _try_infer_map(v):
return None


def infer_dtype(value: object, skipna: bool=False) -> str:
def infer_dtype(value: object, skipna: object=None) -> str:
"""
Efficiently infer the type of a passed val, or list-like
array of values. Return a string describing the type.
Expand All @@ -1088,8 +1089,7 @@ def infer_dtype(value: object, skipna: bool=False) -> str:
----------
value : scalar, list, ndarray, or pandas type
skipna : bool, default False
Ignore NaN values when inferring the type. The default of ``False``
will be deprecated in a later version of pandas.
Ignore NaN values when inferring the type.

.. versionadded:: 0.21.0

Expand Down Expand Up @@ -1186,6 +1186,12 @@ def infer_dtype(value: object, skipna: bool=False) -> str:
bint seen_pdnat = False
bint seen_val = False

if skipna is None:
msg = ('A future version of pandas will default to `skipna=True`. To '
'silence this warning, pass `skipna=True|False` explicitly.')
warnings.warn(msg, FutureWarning, stacklevel=2)
skipna = False

if util.is_array(value):
values = value
elif hasattr(value, 'dtype'):
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/array_.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def array(data, # type: Sequence[object]
return cls._from_sequence(data, dtype=dtype, copy=copy)

if dtype is None:
inferred_dtype = lib.infer_dtype(data)
inferred_dtype = lib.infer_dtype(data, skipna=False)
if inferred_dtype == 'period':
try:
return period_array(data, copy=copy)
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -962,8 +962,8 @@ def _maybe_coerce_merge_keys(self):
# object values are allowed to be merged
elif ((lk_is_object and is_numeric_dtype(rk)) or
(is_numeric_dtype(lk) and rk_is_object)):
inferred_left = lib.infer_dtype(lk)
inferred_right = lib.infer_dtype(rk)
inferred_left = lib.infer_dtype(lk, skipna=False)
inferred_right = lib.infer_dtype(rk, skipna=False)
bool_types = ['integer', 'mixed-integer', 'boolean', 'empty']
string_types = ['string', 'unicode', 'mixed', 'bytes', 'empty']

Expand Down
108 changes: 58 additions & 50 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,11 +334,11 @@ def test_infer_dtype_bytes(self):

# string array of bytes
arr = np.array(list('abc'), dtype='S1')
assert lib.infer_dtype(arr, skipna=False) == compare
assert lib.infer_dtype(arr, skipna=True) == compare

# object array of bytes
arr = arr.astype(object)
assert lib.infer_dtype(arr, skipna=False) == compare
assert lib.infer_dtype(arr, skipna=True) == compare

# object array of bytes with missing values
assert lib.infer_dtype([b'a', np.nan, b'c'], skipna=True) == compare
Expand Down Expand Up @@ -538,32 +538,40 @@ def test_length_zero(self, skipna):

def test_integers(self):
arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O')
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'integer'

arr = np.array([1, 2, 3, np.int64(4), np.int32(5), 'foo'], dtype='O')
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'mixed-integer'

arr = np.array([1, 2, 3, 4, 5], dtype='i4')
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'integer'

def test_deprecation(self):
# GH 24050: calling infer_dtype without an explicit ``skipna`` argument
# must emit a FutureWarning, while the inferred dtype is still returned.
arr = np.array([1, 2, 3], dtype=object)

with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = lib.infer_dtype(arr) # default: skipna=None -> warn
assert result == 'integer'

def test_bools(self):
arr = np.array([True, False, True, True, True], dtype='O')
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'boolean'

arr = np.array([np.bool_(True), np.bool_(False)], dtype='O')
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'boolean'

arr = np.array([True, False, True, 'foo'], dtype='O')
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'mixed'

arr = np.array([True, False, True], dtype=bool)
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'boolean'

arr = np.array([True, np.nan, False], dtype='O')
Expand All @@ -575,38 +583,38 @@ def test_bools(self):

def test_floats(self):
arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O')
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'floating'

arr = np.array([1, 2, 3, np.float64(4), np.float32(5), 'foo'],
dtype='O')
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'mixed-integer'

arr = np.array([1, 2, 3, 4, 5], dtype='f4')
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'floating'

arr = np.array([1, 2, 3, 4, 5], dtype='f8')
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'floating'

def test_decimals(self):
# GH15690
arr = np.array([Decimal(1), Decimal(2), Decimal(3)])
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'decimal'

arr = np.array([1.0, 2.0, Decimal(3)])
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'mixed'

arr = np.array([Decimal(1), Decimal('NaN'), Decimal(3)])
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'decimal'

arr = np.array([Decimal(1), np.nan, Decimal(3)], dtype='O')
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'decimal'

def test_string(self):
Expand Down Expand Up @@ -648,34 +656,34 @@ def test_infer_dtype_datetime(self):

arr = np.array([Timestamp('2011-01-01'),
Timestamp('2011-01-02')])
assert lib.infer_dtype(arr, skipna=False) == 'datetime'
assert lib.infer_dtype(arr, skipna=True) == 'datetime'

arr = np.array([np.datetime64('2011-01-01'),
np.datetime64('2011-01-01')], dtype=object)
assert lib.infer_dtype(arr, skipna=False) == 'datetime64'
assert lib.infer_dtype(arr, skipna=True) == 'datetime64'

arr = np.array([datetime(2011, 1, 1), datetime(2012, 2, 1)])
assert lib.infer_dtype(arr, skipna=False) == 'datetime'
assert lib.infer_dtype(arr, skipna=True) == 'datetime'

# starts with nan
for n in [pd.NaT, np.nan]:
arr = np.array([n, pd.Timestamp('2011-01-02')])
assert lib.infer_dtype(arr, skipna=False) == 'datetime'
assert lib.infer_dtype(arr, skipna=True) == 'datetime'

arr = np.array([n, np.datetime64('2011-01-02')])
assert lib.infer_dtype(arr, skipna=False) == 'datetime64'
assert lib.infer_dtype(arr, skipna=True) == 'datetime64'

arr = np.array([n, datetime(2011, 1, 1)])
assert lib.infer_dtype(arr, skipna=False) == 'datetime'
assert lib.infer_dtype(arr, skipna=True) == 'datetime'

arr = np.array([n, pd.Timestamp('2011-01-02'), n])
assert lib.infer_dtype(arr, skipna=False) == 'datetime'
assert lib.infer_dtype(arr, skipna=True) == 'datetime'

arr = np.array([n, np.datetime64('2011-01-02'), n])
assert lib.infer_dtype(arr, skipna=False) == 'datetime64'
assert lib.infer_dtype(arr, skipna=True) == 'datetime64'

arr = np.array([n, datetime(2011, 1, 1), n])
assert lib.infer_dtype(arr, skipna=False) == 'datetime'
assert lib.infer_dtype(arr, skipna=True) == 'datetime'

# different type of nat
arr = np.array([np.timedelta64('nat'),
Expand All @@ -689,58 +697,58 @@ def test_infer_dtype_datetime(self):
# mixed datetime
arr = np.array([datetime(2011, 1, 1),
pd.Timestamp('2011-01-02')])
assert lib.infer_dtype(arr, skipna=False) == 'datetime'
assert lib.infer_dtype(arr, skipna=True) == 'datetime'

# should be datetime?
arr = np.array([np.datetime64('2011-01-01'),
pd.Timestamp('2011-01-02')])
assert lib.infer_dtype(arr, skipna=False) == 'mixed'
assert lib.infer_dtype(arr, skipna=True) == 'mixed'

arr = np.array([pd.Timestamp('2011-01-02'),
np.datetime64('2011-01-01')])
assert lib.infer_dtype(arr, skipna=False) == 'mixed'
assert lib.infer_dtype(arr, skipna=True) == 'mixed'

arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1])
assert lib.infer_dtype(arr, skipna=False) == 'mixed-integer'
assert lib.infer_dtype(arr, skipna=True) == 'mixed-integer'

arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1.1])
assert lib.infer_dtype(arr, skipna=False) == 'mixed'
assert lib.infer_dtype(arr, skipna=True) == 'mixed'

arr = np.array([np.nan, '2011-01-01', pd.Timestamp('2011-01-02')])
assert lib.infer_dtype(arr, skipna=False) == 'mixed'
assert lib.infer_dtype(arr, skipna=True) == 'mixed'

def test_infer_dtype_timedelta(self):

arr = np.array([pd.Timedelta('1 days'),
pd.Timedelta('2 days')])
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

arr = np.array([np.timedelta64(1, 'D'),
np.timedelta64(2, 'D')], dtype=object)
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

arr = np.array([timedelta(1), timedelta(2)])
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

# starts with nan
for n in [pd.NaT, np.nan]:
arr = np.array([n, Timedelta('1 days')])
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

arr = np.array([n, np.timedelta64(1, 'D')])
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

arr = np.array([n, timedelta(1)])
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

arr = np.array([n, pd.Timedelta('1 days'), n])
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

arr = np.array([n, np.timedelta64(1, 'D'), n])
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

arr = np.array([n, timedelta(1), n])
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

# different type of nat
arr = np.array([np.datetime64('nat'), np.timedelta64(1, 'D')],
Expand All @@ -755,19 +763,19 @@ def test_infer_dtype_period(self):
# GH 13664
arr = np.array([pd.Period('2011-01', freq='D'),
pd.Period('2011-02', freq='D')])
assert lib.infer_dtype(arr, skipna=False) == 'period'
assert lib.infer_dtype(arr, skipna=True) == 'period'

arr = np.array([pd.Period('2011-01', freq='D'),
pd.Period('2011-02', freq='M')])
assert lib.infer_dtype(arr, skipna=False) == 'period'
assert lib.infer_dtype(arr, skipna=True) == 'period'

# starts with nan
for n in [pd.NaT, np.nan]:
arr = np.array([n, pd.Period('2011-01', freq='D')])
assert lib.infer_dtype(arr, skipna=False) == 'period'
assert lib.infer_dtype(arr, skipna=True) == 'period'

arr = np.array([n, pd.Period('2011-01', freq='D'), n])
assert lib.infer_dtype(arr, skipna=False) == 'period'
assert lib.infer_dtype(arr, skipna=True) == 'period'

# different type of nat
arr = np.array([np.datetime64('nat'), pd.Period('2011-01', freq='M')],
Expand Down Expand Up @@ -846,7 +854,7 @@ def test_infer_datetimelike_array_nan_nat_like(self, first, second,

def test_infer_dtype_all_nan_nat_like(self):
arr = np.array([np.nan, np.nan])
assert lib.infer_dtype(arr, skipna=False) == 'floating'
assert lib.infer_dtype(arr, skipna=True) == 'floating'

# a mix of nan and None results in 'mixed'
arr = np.array([np.nan, np.nan, None])
Expand Down Expand Up @@ -1043,17 +1051,17 @@ def test_categorical(self):
# GH 8974
from pandas import Categorical, Series
arr = Categorical(list('abc'))
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'categorical'

result = lib.infer_dtype(Series(arr), skipna=False)
result = lib.infer_dtype(Series(arr), skipna=True)
assert result == 'categorical'

arr = Categorical(list('abc'), categories=['cegfab'], ordered=True)
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'categorical'

result = lib.infer_dtype(Series(arr), skipna=False)
result = lib.infer_dtype(Series(arr), skipna=True)
assert result == 'categorical'


Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -813,12 +813,12 @@ def test_constructor_with_datetime_tz(self):
s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')])
assert s.dtype == 'datetime64[ns, US/Pacific]'
assert lib.infer_dtype(s, skipna=False) == 'datetime64'
assert lib.infer_dtype(s, skipna=True) == 'datetime64'

s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Eastern')])
assert s.dtype == 'object'
assert lib.infer_dtype(s, skipna=False) == 'datetime'
assert lib.infer_dtype(s, skipna=True) == 'datetime'

# with all NaT
s = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]')
Expand Down

0 comments on commit 0de9973

Please sign in to comment.