Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEPR: deprecate default of skipna=False in infer_dtype #24050

Merged
merged 19 commits into from
Jan 4, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1220,6 +1220,7 @@ Deprecations
- :func:`pandas.api.types.is_datetimetz` is deprecated in favor of :func:`pandas.api.types.is_datetime64tz_dtype` (:issue:`23917`)
- Creating a :class:`TimedeltaIndex`, :class:`DatetimeIndex`, or :class:`PeriodIndex` by passing range arguments `start`, `end`, and `periods` is deprecated in favor of :func:`timedelta_range`, :func:`date_range`, or :func:`period_range` (:issue:`23919`)
- Passing a string alias like ``'datetime64[ns, UTC]'`` as the ``unit`` parameter to :class:`DatetimeTZDtype` is deprecated. Use :class:`DatetimeTZDtype.construct_from_string` instead (:issue:`23990`).
- The ``skipna`` parameter of :meth:`~pandas.api.types.infer_dtype` will switch to ``True`` by default in a future version of pandas (:issue:`17066`, :issue:`24050`)
- In :meth:`Series.where` with Categorical data, providing an ``other`` that is not present in the categories is deprecated. Convert the categorical to a different dtype or add the ``other`` to the categories first (:issue:`24077`).
- :meth:`Series.clip_lower`, :meth:`Series.clip_upper`, :meth:`DataFrame.clip_lower` and :meth:`DataFrame.clip_upper` are deprecated and will be removed in a future version. Use ``Series.clip(lower=threshold)``, ``Series.clip(upper=threshold)`` and the equivalent ``DataFrame`` methods (:issue:`24203`)

Expand Down
12 changes: 9 additions & 3 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ from fractions import Fraction
from numbers import Number

import sys
import warnings

import cython
from cython import Py_ssize_t
Expand Down Expand Up @@ -1079,7 +1080,7 @@ cdef _try_infer_map(v):
return None


def infer_dtype(value: object, skipna: bool=False) -> str:
def infer_dtype(value: object, skipna: object=None) -> str:
"""
Efficiently infer the type of a passed val, or list-like
array of values. Return a string describing the type.
Expand All @@ -1088,8 +1089,7 @@ def infer_dtype(value: object, skipna: bool=False) -> str:
----------
value : scalar, list, ndarray, or pandas type
skipna : bool, default False
Ignore NaN values when inferring the type. The default of ``False``
will be deprecated in a later version of pandas.
TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved
Ignore NaN values when inferring the type.
jreback marked this conversation as resolved.
Show resolved Hide resolved

.. versionadded:: 0.21.0

Expand Down Expand Up @@ -1186,6 +1186,12 @@ def infer_dtype(value: object, skipna: bool=False) -> str:
bint seen_pdnat = False
bint seen_val = False

if skipna is None:
msg = ('A future version of pandas will default to `skipna=True`. To '
'silence this warning, pass `skipna=True|False` explicitly.')
warnings.warn(msg, FutureWarning, stacklevel=2)
skipna = False

if util.is_array(value):
values = value
elif hasattr(value, 'dtype'):
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/array_.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def array(data, # type: Sequence[object]
return cls._from_sequence(data, dtype=dtype, copy=copy)

if dtype is None:
inferred_dtype = lib.infer_dtype(data)
inferred_dtype = lib.infer_dtype(data, skipna=False)
if inferred_dtype == 'period':
try:
return period_array(data, copy=copy)
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -962,8 +962,8 @@ def _maybe_coerce_merge_keys(self):
# object values are allowed to be merged
elif ((lk_is_object and is_numeric_dtype(rk)) or
(is_numeric_dtype(lk) and rk_is_object)):
inferred_left = lib.infer_dtype(lk)
inferred_right = lib.infer_dtype(rk)
inferred_left = lib.infer_dtype(lk, skipna=False)
inferred_right = lib.infer_dtype(rk, skipna=False)
bool_types = ['integer', 'mixed-integer', 'boolean', 'empty']
string_types = ['string', 'unicode', 'mixed', 'bytes', 'empty']

Expand Down
108 changes: 58 additions & 50 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,11 +334,11 @@ def test_infer_dtype_bytes(self):

# string array of bytes
arr = np.array(list('abc'), dtype='S1')
assert lib.infer_dtype(arr, skipna=False) == compare
assert lib.infer_dtype(arr, skipna=True) == compare

# object array of bytes
arr = arr.astype(object)
assert lib.infer_dtype(arr, skipna=False) == compare
assert lib.infer_dtype(arr, skipna=True) == compare

# object array of bytes with missing values
assert lib.infer_dtype([b'a', np.nan, b'c'], skipna=True) == compare
Expand Down Expand Up @@ -538,32 +538,40 @@ def test_length_zero(self, skipna):

def test_integers(self):
arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O')
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'integer'

arr = np.array([1, 2, 3, np.int64(4), np.int32(5), 'foo'], dtype='O')
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'mixed-integer'

arr = np.array([1, 2, 3, 4, 5], dtype='i4')
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'integer'

def test_deprecation(self):
# GH 24050
arr = np.array([1, 2, 3], dtype=object)

with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = lib.infer_dtype(arr) # default: skipna=None -> warn
assert result == 'integer'

def test_bools(self):
arr = np.array([True, False, True, True, True], dtype='O')
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'boolean'

arr = np.array([np.bool_(True), np.bool_(False)], dtype='O')
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'boolean'

arr = np.array([True, False, True, 'foo'], dtype='O')
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'mixed'

arr = np.array([True, False, True], dtype=bool)
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'boolean'

arr = np.array([True, np.nan, False], dtype='O')
Expand All @@ -575,38 +583,38 @@ def test_bools(self):

def test_floats(self):
arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O')
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'floating'

arr = np.array([1, 2, 3, np.float64(4), np.float32(5), 'foo'],
dtype='O')
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'mixed-integer'

arr = np.array([1, 2, 3, 4, 5], dtype='f4')
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'floating'

arr = np.array([1, 2, 3, 4, 5], dtype='f8')
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'floating'

def test_decimals(self):
# GH15690
arr = np.array([Decimal(1), Decimal(2), Decimal(3)])
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'decimal'

arr = np.array([1.0, 2.0, Decimal(3)])
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'mixed'

arr = np.array([Decimal(1), Decimal('NaN'), Decimal(3)])
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'decimal'

arr = np.array([Decimal(1), np.nan, Decimal(3)], dtype='O')
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'decimal'

def test_string(self):
Expand Down Expand Up @@ -648,34 +656,34 @@ def test_infer_dtype_datetime(self):

arr = np.array([Timestamp('2011-01-01'),
Timestamp('2011-01-02')])
assert lib.infer_dtype(arr, skipna=False) == 'datetime'
assert lib.infer_dtype(arr, skipna=True) == 'datetime'

arr = np.array([np.datetime64('2011-01-01'),
np.datetime64('2011-01-01')], dtype=object)
assert lib.infer_dtype(arr, skipna=False) == 'datetime64'
assert lib.infer_dtype(arr, skipna=True) == 'datetime64'

arr = np.array([datetime(2011, 1, 1), datetime(2012, 2, 1)])
assert lib.infer_dtype(arr, skipna=False) == 'datetime'
assert lib.infer_dtype(arr, skipna=True) == 'datetime'

# starts with nan
for n in [pd.NaT, np.nan]:
arr = np.array([n, pd.Timestamp('2011-01-02')])
assert lib.infer_dtype(arr, skipna=False) == 'datetime'
assert lib.infer_dtype(arr, skipna=True) == 'datetime'

arr = np.array([n, np.datetime64('2011-01-02')])
assert lib.infer_dtype(arr, skipna=False) == 'datetime64'
assert lib.infer_dtype(arr, skipna=True) == 'datetime64'

arr = np.array([n, datetime(2011, 1, 1)])
assert lib.infer_dtype(arr, skipna=False) == 'datetime'
assert lib.infer_dtype(arr, skipna=True) == 'datetime'

arr = np.array([n, pd.Timestamp('2011-01-02'), n])
assert lib.infer_dtype(arr, skipna=False) == 'datetime'
assert lib.infer_dtype(arr, skipna=True) == 'datetime'

arr = np.array([n, np.datetime64('2011-01-02'), n])
assert lib.infer_dtype(arr, skipna=False) == 'datetime64'
assert lib.infer_dtype(arr, skipna=True) == 'datetime64'

arr = np.array([n, datetime(2011, 1, 1), n])
assert lib.infer_dtype(arr, skipna=False) == 'datetime'
assert lib.infer_dtype(arr, skipna=True) == 'datetime'

# different type of nat
arr = np.array([np.timedelta64('nat'),
Expand All @@ -689,58 +697,58 @@ def test_infer_dtype_datetime(self):
# mixed datetime
arr = np.array([datetime(2011, 1, 1),
pd.Timestamp('2011-01-02')])
assert lib.infer_dtype(arr, skipna=False) == 'datetime'
assert lib.infer_dtype(arr, skipna=True) == 'datetime'

# should be datetime?
arr = np.array([np.datetime64('2011-01-01'),
pd.Timestamp('2011-01-02')])
assert lib.infer_dtype(arr, skipna=False) == 'mixed'
assert lib.infer_dtype(arr, skipna=True) == 'mixed'

arr = np.array([pd.Timestamp('2011-01-02'),
np.datetime64('2011-01-01')])
assert lib.infer_dtype(arr, skipna=False) == 'mixed'
assert lib.infer_dtype(arr, skipna=True) == 'mixed'

arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1])
assert lib.infer_dtype(arr, skipna=False) == 'mixed-integer'
assert lib.infer_dtype(arr, skipna=True) == 'mixed-integer'

arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1.1])
assert lib.infer_dtype(arr, skipna=False) == 'mixed'
assert lib.infer_dtype(arr, skipna=True) == 'mixed'

arr = np.array([np.nan, '2011-01-01', pd.Timestamp('2011-01-02')])
assert lib.infer_dtype(arr, skipna=False) == 'mixed'
assert lib.infer_dtype(arr, skipna=True) == 'mixed'

def test_infer_dtype_timedelta(self):

arr = np.array([pd.Timedelta('1 days'),
pd.Timedelta('2 days')])
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

arr = np.array([np.timedelta64(1, 'D'),
np.timedelta64(2, 'D')], dtype=object)
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

arr = np.array([timedelta(1), timedelta(2)])
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

# starts with nan
for n in [pd.NaT, np.nan]:
arr = np.array([n, Timedelta('1 days')])
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

arr = np.array([n, np.timedelta64(1, 'D')])
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

arr = np.array([n, timedelta(1)])
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

arr = np.array([n, pd.Timedelta('1 days'), n])
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

arr = np.array([n, np.timedelta64(1, 'D'), n])
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

arr = np.array([n, timedelta(1), n])
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
assert lib.infer_dtype(arr, skipna=True) == 'timedelta'

# different type of nat
arr = np.array([np.datetime64('nat'), np.timedelta64(1, 'D')],
Expand All @@ -755,19 +763,19 @@ def test_infer_dtype_period(self):
# GH 13664
arr = np.array([pd.Period('2011-01', freq='D'),
pd.Period('2011-02', freq='D')])
assert lib.infer_dtype(arr, skipna=False) == 'period'
assert lib.infer_dtype(arr, skipna=True) == 'period'

arr = np.array([pd.Period('2011-01', freq='D'),
pd.Period('2011-02', freq='M')])
assert lib.infer_dtype(arr, skipna=False) == 'period'
assert lib.infer_dtype(arr, skipna=True) == 'period'

# starts with nan
for n in [pd.NaT, np.nan]:
arr = np.array([n, pd.Period('2011-01', freq='D')])
assert lib.infer_dtype(arr, skipna=False) == 'period'
assert lib.infer_dtype(arr, skipna=True) == 'period'

arr = np.array([n, pd.Period('2011-01', freq='D'), n])
assert lib.infer_dtype(arr, skipna=False) == 'period'
assert lib.infer_dtype(arr, skipna=True) == 'period'

# different type of nat
arr = np.array([np.datetime64('nat'), pd.Period('2011-01', freq='M')],
Expand Down Expand Up @@ -846,7 +854,7 @@ def test_infer_datetimelike_array_nan_nat_like(self, first, second,

def test_infer_dtype_all_nan_nat_like(self):
arr = np.array([np.nan, np.nan])
assert lib.infer_dtype(arr, skipna=False) == 'floating'
assert lib.infer_dtype(arr, skipna=True) == 'floating'

# a mix of nan and None results in mixed
arr = np.array([np.nan, np.nan, None])
Expand Down Expand Up @@ -1043,17 +1051,17 @@ def test_categorical(self):
# GH 8974
from pandas import Categorical, Series
arr = Categorical(list('abc'))
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'categorical'

result = lib.infer_dtype(Series(arr), skipna=False)
result = lib.infer_dtype(Series(arr), skipna=True)
assert result == 'categorical'

arr = Categorical(list('abc'), categories=['cegfab'], ordered=True)
result = lib.infer_dtype(arr, skipna=False)
result = lib.infer_dtype(arr, skipna=True)
assert result == 'categorical'

result = lib.infer_dtype(Series(arr), skipna=False)
result = lib.infer_dtype(Series(arr), skipna=True)
assert result == 'categorical'


Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -813,12 +813,12 @@ def test_constructor_with_datetime_tz(self):
s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')])
assert s.dtype == 'datetime64[ns, US/Pacific]'
assert lib.infer_dtype(s, skipna=False) == 'datetime64'
assert lib.infer_dtype(s, skipna=True) == 'datetime64'

s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Eastern')])
assert s.dtype == 'object'
assert lib.infer_dtype(s, skipna=False) == 'datetime'
assert lib.infer_dtype(s, skipna=True) == 'datetime'

# with all NaT
s = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]')
Expand Down