Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEPR: deprecate default of skipna=False in infer_dtype #24050

Merged
merged 19 commits into from
Jan 4, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -622,7 +622,7 @@ def clean_index_list(obj: list):
return obj, all_arrays

# don't force numpy to coerce when NaNs are present
inferred = infer_dtype(obj)
inferred = infer_dtype(obj, skipna=False)
jreback marked this conversation as resolved.
Show resolved Hide resolved
if inferred in ['string', 'bytes', 'unicode', 'mixed', 'mixed-integer']:
return np.asarray(obj, dtype=object), 0
elif inferred in ['integer']:
Expand Down Expand Up @@ -1078,19 +1078,20 @@ cdef _try_infer_map(v):
return None


def infer_dtype(value: object, skipna: bool=False) -> str:
def infer_dtype(value: object, skipna: bool=True) -> str:
"""
Efficiently infer the type of a passed val, or list-like
array of values. Return a string describing the type.

Parameters
----------
value : scalar, list, ndarray, or pandas type
skipna : bool, default False
Ignore NaN values when inferring the type. The default of ``False``
will be deprecated in a later version of pandas.
TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved
skipna : bool, default True
Ignore NaN values when inferring the type.
jreback marked this conversation as resolved.
Show resolved Hide resolved

.. versionadded:: 0.21.0
.. versionchanged:: 0.24.0
Switched default of ``skipna`` to ``True``

Returns
-------
Expand Down Expand Up @@ -1209,6 +1210,10 @@ def infer_dtype(value: object, skipna: bool=False) -> str:
values = construct_1d_object_array_from_listlike(value)

values = getattr(values, 'values', values)

jreback marked this conversation as resolved.
Show resolved Hide resolved
# make contiguous
values = values.ravel()

if skipna:
values = values[~isnaobj(values)]

Expand All @@ -1219,9 +1224,6 @@ def infer_dtype(value: object, skipna: bool=False) -> str:
if values.dtype != np.object_:
values = values.astype('O')

# make contiguous
values = values.ravel()

n = len(values)
if n == 0:
return 'empty'
Expand Down
14 changes: 7 additions & 7 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,7 +545,7 @@ def any_numpy_dtype(request):


# categoricals are handled separately
_any_skipna_inferred_dtype = [
_any_inferred_dtype = [
jreback marked this conversation as resolved.
Show resolved Hide resolved
('string', ['a', np.nan, 'c']),
('unicode' if not PY3 else 'string', [u('a'), np.nan, u('c')]),
('bytes' if PY3 else 'string', [b'a', np.nan, b'c']),
Expand All @@ -570,11 +570,11 @@ def any_numpy_dtype(request):
('time', [time(1), np.nan, time(2)]),
('period', [pd.Period(2013), pd.NaT, pd.Period(2018)]),
('interval', [pd.Interval(0, 1), np.nan, pd.Interval(0, 2)])]
ids, _ = zip(*_any_skipna_inferred_dtype) # use inferred type as fixture-id
ids, _ = zip(*_any_inferred_dtype) # use inferred type as fixture-id


@pytest.fixture(params=_any_skipna_inferred_dtype, ids=ids)
def any_skipna_inferred_dtype(request):
@pytest.fixture(params=_any_inferred_dtype, ids=ids)
def any_inferred_dtype(request):
"""
Fixture for all inferred dtypes from _libs.lib.infer_dtype

Expand Down Expand Up @@ -610,10 +610,10 @@ def any_skipna_inferred_dtype(request):
--------
>>> import pandas._libs.lib as lib
>>>
>>> def test_something(any_skipna_inferred_dtype):
... inferred_dtype, values = any_skipna_inferred_dtype
>>> def test_something(any_inferred_dtype):
... inferred_dtype, values = any_inferred_dtype
... # will pass
... assert lib.infer_dtype(values, skipna=True) == inferred_dtype
... assert lib.infer_dtype(values) == inferred_dtype
"""
inferred_dtype, values = request.param
values = np.array(values, dtype=object) # object dtype to avoid casting
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ def _get_hashtable_algo(values):
if ndtype == 'object':

# it's cheaper to use a String Hash Table than an Object one
if lib.infer_dtype(values) in ['string']:
if lib.infer_dtype(values, skipna=False) in ['string']:
TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved
ndtype = 'string'
else:
ndtype = 'object'
Expand All @@ -221,7 +221,7 @@ def _get_data_algo(values, func_map):
if ndtype == 'object':

# it's cheaper to use a String Hash Table than an Object one
if lib.infer_dtype(values) in ['string']:
if lib.infer_dtype(values, skipna=False) in ['string']:
ndtype = 'string'

f = func_map.get(ndtype, func_map['object'])
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def coerce_to_array(values, dtype, mask=None, copy=False):
values = np.array(values, copy=copy)
if is_object_dtype(values):
inferred_type = lib.infer_dtype(values)
if inferred_type is 'mixed' and isna(values).all():
if inferred_type == 'empty':
values = np.empty(len(values))
values.fill(np.nan)
elif inferred_type not in ['floating', 'integer',
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -940,7 +940,7 @@ def try_timedelta(v):
# e.g. '00:00:01' is a timedelta but
# technically is also a datetime
value = try_timedelta(v)
if lib.infer_dtype(value) in ['mixed']:
if lib.infer_dtype(value, skipna=False) in ['mixed']:
jreback marked this conversation as resolved.
Show resolved Hide resolved
value = try_datetime(v)

return value
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
subarr = subarr.copy()

if dtype is None:
inferred = lib.infer_dtype(subarr)
inferred = lib.infer_dtype(subarr, skipna=False)
jreback marked this conversation as resolved.
Show resolved Hide resolved
if inferred == 'integer':
try:
return cls._try_convert_to_int_index(
Expand Down
23 changes: 7 additions & 16 deletions pandas/io/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -819,27 +819,15 @@ def _harmonize_columns(self, parse_dates=None):
except KeyError:
pass # this column not in results

def _get_notna_col_dtype(self, col):
jreback marked this conversation as resolved.
Show resolved Hide resolved
"""
Infer datatype of the Series col. In case the dtype of col is 'object'
and it contains NA values, this infers the datatype of the not-NA
values. Needed for inserting typed data containing NULLs, GH8778.
"""
col_for_inference = col
if col.dtype == 'object':
notnadata = col[~isna(col)]
if len(notnadata):
col_for_inference = notnadata

return lib.infer_dtype(col_for_inference)

def _sqlalchemy_type(self, col):

dtype = self.dtype or {}
if col.name in dtype:
return self.dtype[col.name]

col_type = self._get_notna_col_dtype(col)
# Infer type of column, while ignoring missing values.
# Needed for inserting typed data containing NULLs, GH 8778.
col_type = lib.infer_dtype(col)

from sqlalchemy.types import (BigInteger, Integer, Float,
Text, Boolean,
Expand Down Expand Up @@ -1325,7 +1313,10 @@ def _sql_type_name(self, col):
if col.name in dtype:
return dtype[col.name]

col_type = self._get_notna_col_dtype(col)
# Infer type of column, while ignoring missing values.
# Needed for inserting typed data containing NULLs, GH 8778.
col_type = lib.infer_dtype(col)

if col_type == 'timedelta64':
warnings.warn("the 'timedelta' type is not supported, and will be "
"written as integer values (ns frequency) to the "
Expand Down
2 changes: 1 addition & 1 deletion pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1865,7 +1865,7 @@ def _dtype_to_default_stata_fmt(dtype, column, dta_version=114,
if force_strl:
return '%9s'
if dtype.type == np.object_:
inferred_dtype = infer_dtype(column.dropna())
jreback marked this conversation as resolved.
Show resolved Hide resolved
inferred_dtype = infer_dtype(column)
if not (inferred_dtype in ('string', 'unicode') or
len(column) == 0):
raise ValueError('Column `{col}` cannot be exported.\n\nOnly '
Expand Down
67 changes: 37 additions & 30 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,12 +496,12 @@ class TestTypeInference(object):
class Dummy():
pass

def test_inferred_dtype_fixture(self, any_skipna_inferred_dtype):
def test_inferred_dtype_fixture(self, any_inferred_dtype):
# see pandas/conftest.py
inferred_dtype, values = any_skipna_inferred_dtype
inferred_dtype, values = any_inferred_dtype

# make sure the inferred dtype of the fixture is as requested
assert inferred_dtype == lib.infer_dtype(values, skipna=True)
assert inferred_dtype == lib.infer_dtype(values)

def test_length_zero(self):
result = lib.infer_dtype(np.array([], dtype='i4'))
Expand Down Expand Up @@ -547,9 +547,12 @@ def test_bools(self):
assert result == 'boolean'

arr = np.array([True, np.nan, False], dtype='O')
result = lib.infer_dtype(arr, skipna=True)
result = lib.infer_dtype(arr)
assert result == 'boolean'

result = lib.infer_dtype(arr, skipna=False)
assert result == 'mixed'

def test_floats(self):
arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O')
result = lib.infer_dtype(arr)
Expand Down Expand Up @@ -591,11 +594,11 @@ def test_string(self):

def test_unicode(self):
arr = [u'a', np.nan, u'c']
result = lib.infer_dtype(arr)
result = lib.infer_dtype(arr, skipna=False)
assert result == 'mixed'

arr = [u'a', np.nan, u'c']
result = lib.infer_dtype(arr, skipna=True)
result = lib.infer_dtype(arr)
expected = 'unicode' if PY2 else 'string'
assert result == expected

Expand Down Expand Up @@ -657,11 +660,11 @@ def test_infer_dtype_datetime(self):
# different type of nat
arr = np.array([np.timedelta64('nat'),
np.datetime64('2011-01-02')], dtype=object)
assert lib.infer_dtype(arr) == 'mixed'
assert lib.infer_dtype(arr, skipna=False) == 'mixed'

arr = np.array([np.datetime64('2011-01-02'),
np.timedelta64('nat')], dtype=object)
assert lib.infer_dtype(arr) == 'mixed'
assert lib.infer_dtype(arr, skipna=False) == 'mixed'

# mixed datetime
arr = np.array([datetime(2011, 1, 1),
Expand Down Expand Up @@ -722,11 +725,11 @@ def test_infer_dtype_timedelta(self):
# different type of nat
arr = np.array([np.datetime64('nat'), np.timedelta64(1, 'D')],
dtype=object)
assert lib.infer_dtype(arr) == 'mixed'
assert lib.infer_dtype(arr, skipna=False) == 'mixed'

arr = np.array([np.timedelta64(1, 'D'), np.datetime64('nat')],
dtype=object)
assert lib.infer_dtype(arr) == 'mixed'
assert lib.infer_dtype(arr, skipna=False) == 'mixed'

def test_infer_dtype_period(self):
# GH 13664
Expand All @@ -749,11 +752,11 @@ def test_infer_dtype_period(self):
# different type of nat
arr = np.array([np.datetime64('nat'), pd.Period('2011-01', freq='M')],
dtype=object)
assert lib.infer_dtype(arr) == 'mixed'
assert lib.infer_dtype(arr, skipna=False) == 'mixed'

arr = np.array([pd.Period('2011-01', freq='M'), np.datetime64('nat')],
dtype=object)
assert lib.infer_dtype(arr) == 'mixed'
assert lib.infer_dtype(arr, skipna=False) == 'mixed'

@pytest.mark.parametrize(
"data",
Expand Down Expand Up @@ -827,56 +830,58 @@ def test_infer_dtype_all_nan_nat_like(self):

# a mix of nan and None results in 'mixed' when skipna=False
arr = np.array([np.nan, np.nan, None])
assert lib.infer_dtype(arr) == 'mixed'
assert lib.infer_dtype(arr) == 'empty'
assert lib.infer_dtype(arr, skipna=False) == 'mixed'

arr = np.array([None, np.nan, np.nan])
assert lib.infer_dtype(arr) == 'mixed'
assert lib.infer_dtype(arr) == 'empty'
assert lib.infer_dtype(arr, skipna=False) == 'mixed'

# pd.NaT
arr = np.array([pd.NaT])
assert lib.infer_dtype(arr) == 'datetime'
assert lib.infer_dtype(arr, skipna=False) == 'datetime'
jreback marked this conversation as resolved.
Show resolved Hide resolved

arr = np.array([pd.NaT, np.nan])
assert lib.infer_dtype(arr) == 'datetime'
assert lib.infer_dtype(arr, skipna=False) == 'datetime'

arr = np.array([np.nan, pd.NaT])
assert lib.infer_dtype(arr) == 'datetime'
assert lib.infer_dtype(arr, skipna=False) == 'datetime'

arr = np.array([np.nan, pd.NaT, np.nan])
assert lib.infer_dtype(arr) == 'datetime'
assert lib.infer_dtype(arr, skipna=False) == 'datetime'

arr = np.array([None, pd.NaT, None])
assert lib.infer_dtype(arr) == 'datetime'
assert lib.infer_dtype(arr, skipna=False) == 'datetime'

# np.datetime64(nat)
arr = np.array([np.datetime64('nat')])
assert lib.infer_dtype(arr) == 'datetime64'
assert lib.infer_dtype(arr, skipna=False) == 'datetime64'

for n in [np.nan, pd.NaT, None]:
arr = np.array([n, np.datetime64('nat'), n])
assert lib.infer_dtype(arr) == 'datetime64'
assert lib.infer_dtype(arr, skipna=False) == 'datetime64'

arr = np.array([pd.NaT, n, np.datetime64('nat'), n])
assert lib.infer_dtype(arr) == 'datetime64'
assert lib.infer_dtype(arr, skipna=False) == 'datetime64'

arr = np.array([np.timedelta64('nat')], dtype=object)
assert lib.infer_dtype(arr) == 'timedelta'
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'

for n in [np.nan, pd.NaT, None]:
arr = np.array([n, np.timedelta64('nat'), n])
assert lib.infer_dtype(arr) == 'timedelta'
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'

arr = np.array([pd.NaT, n, np.timedelta64('nat'), n])
assert lib.infer_dtype(arr) == 'timedelta'
assert lib.infer_dtype(arr, skipna=False) == 'timedelta'

# datetime / timedelta mixed
arr = np.array([pd.NaT, np.datetime64('nat'),
np.timedelta64('nat'), np.nan])
assert lib.infer_dtype(arr) == 'mixed'
assert lib.infer_dtype(arr, skipna=False) == 'mixed'

arr = np.array([np.timedelta64('nat'), np.datetime64('nat')],
dtype=object)
assert lib.infer_dtype(arr) == 'mixed'
assert lib.infer_dtype(arr, skipna=False) == 'mixed'

def test_is_datetimelike_array_all_nan_nat_like(self):
arr = np.array([np.nan, pd.NaT, np.datetime64('nat')])
Expand Down Expand Up @@ -940,10 +945,10 @@ def test_date(self):
assert index.inferred_type == 'date'

dates = [date(2012, 1, day) for day in range(1, 20)] + [np.nan]
result = lib.infer_dtype(dates)
result = lib.infer_dtype(dates, skipna=False)
assert result == 'mixed'

result = lib.infer_dtype(dates, skipna=True)
result = lib.infer_dtype(dates)
assert result == 'date'

def test_is_numeric_array(self):
Expand Down Expand Up @@ -984,8 +989,10 @@ def test_object(self):
# GH 7431
# cannot infer more than this as only a single element
arr = np.array([None], dtype='O')
result = lib.infer_dtype(arr)
result = lib.infer_dtype(arr, skipna=False)
assert result == 'mixed'
result = lib.infer_dtype(arr)
assert result == 'empty'

def test_to_object_array_width(self):
# see gh-13320
Expand Down
Loading