Skip to content

Commit

Permalink
ENH: Make maybe_convert_object respect dtype itemsize (#40908)
Browse files Browse the repository at this point in the history
  • Loading branch information
rhshadrach committed Apr 21, 2021
1 parent 1749a81 commit 549e39b
Show file tree
Hide file tree
Showing 10 changed files with 142 additions and 45 deletions.
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,7 @@ Other enhancements
- :meth:`pandas.read_csv` and :meth:`pandas.read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`)
- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` use Kleene logic with nullable data types (:issue:`37506`)
- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`)
- Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable of NumPy scalars that is *not* a NumPy ``ndarray`` will now result in a dtype whose precision is the maximum of the precisions of those NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`)
- Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`)
-

Expand Down Expand Up @@ -691,7 +692,7 @@ Numeric
- Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`)
- Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`)
- Bug in :meth:`DataFrameGroupBy.rank` giving incorrect results with ``pct=True`` and equal values between consecutive groups (:issue:`40518`)
-
- Bug in :meth:`Series.count` would result in an ``int32`` result on 32-bit platforms when argument ``level=None`` (:issue:`40908`)

Conversion
^^^^^^^^^^
Expand Down
88 changes: 61 additions & 27 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ cdef extern from "numpy/arrayobject.h":
object fields
tuple names

cdef extern from "numpy/ndarrayobject.h":
bint PyArray_CheckScalar(obj) nogil


cdef extern from "src/parse_helper.h":
int floatify(object, float64_t *result, int *maybe_int) except -1
Expand Down Expand Up @@ -209,6 +212,24 @@ def is_scalar(val: object) -> bool:
or is_offset_object(val))


cdef inline int64_t get_itemsize(object val):
    """
    Get the itemsize of a NumPy scalar, -1 if not a NumPy scalar.

    Parameters
    ----------
    val : object
        Candidate value; only NumPy scalars (PyArray_CheckScalar) report a size.

    Returns
    -------
    itemsize : int64_t
        The itemsize of ``val``'s dtype if it is a NumPy scalar,
        -1 otherwise (sentinel meaning "not a NumPy scalar").
    """
    if PyArray_CheckScalar(val):
        # A NumPy scalar carries a dtype descriptor; report its byte width.
        return cnp.PyArray_DescrFromScalar(val).itemsize
    else:
        return -1


def is_iterator(obj: object) -> bool:
"""
Check if the object is an iterator.
Expand Down Expand Up @@ -2188,7 +2209,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,

Parameters
----------
values : ndarray[object]
objects : ndarray[object]
Array of object elements to convert.
try_float : bool, default False
If an array-like object contains only float or NaN values is
Expand All @@ -2212,7 +2233,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
Array of converted object values to more specific dtypes if applicable.
"""
cdef:
Py_ssize_t i, n
Py_ssize_t i, n, itemsize_max = 0
ndarray[float64_t] floats
ndarray[complex128_t] complexes
ndarray[int64_t] ints
Expand Down Expand Up @@ -2245,6 +2266,10 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,

for i in range(n):
val = objects[i]
if itemsize_max != -1:
itemsize = get_itemsize(val)
if itemsize > itemsize_max or itemsize == -1:
itemsize_max = itemsize

if val is None:
seen.null_ = True
Expand Down Expand Up @@ -2346,92 +2371,101 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
seen.object_ = True

if not seen.object_:
result = None
if not safe:
if seen.null_ or seen.nan_:
if seen.is_float_or_complex:
if seen.complex_:
return complexes
result = complexes
elif seen.float_:
return floats
result = floats
elif seen.int_:
if convert_to_nullable_integer:
from pandas.core.arrays import IntegerArray
return IntegerArray(ints, mask)
result = IntegerArray(ints, mask)
else:
return floats
result = floats
elif seen.nan_:
return floats
result = floats
else:
if not seen.bool_:
if seen.datetime_:
if not seen.numeric_ and not seen.timedelta_:
return datetimes
result = datetimes
elif seen.timedelta_:
if not seen.numeric_:
return timedeltas
result = timedeltas
elif seen.nat_:
if not seen.numeric_:
if convert_datetime and convert_timedelta:
# TODO: array full of NaT ambiguity resolve here needed
pass
elif convert_datetime:
return datetimes
result = datetimes
elif convert_timedelta:
return timedeltas
result = timedeltas
else:
if seen.complex_:
return complexes
result = complexes
elif seen.float_:
return floats
result = floats
elif seen.int_:
if seen.uint_:
return uints
result = uints
else:
return ints
result = ints
elif seen.is_bool:
return bools.view(np.bool_)
result = bools.view(np.bool_)

else:
# don't cast int to float, etc.
if seen.null_:
if seen.is_float_or_complex:
if seen.complex_:
if not seen.int_:
return complexes
result = complexes
elif seen.float_ or seen.nan_:
if not seen.int_:
return floats
result = floats
else:
if not seen.bool_:
if seen.datetime_:
if not seen.numeric_ and not seen.timedelta_:
return datetimes
result = datetimes
elif seen.timedelta_:
if not seen.numeric_:
return timedeltas
result = timedeltas
elif seen.nat_:
if not seen.numeric_:
if convert_datetime and convert_timedelta:
# TODO: array full of NaT ambiguity resolve here needed
pass
elif convert_datetime:
return datetimes
result = datetimes
elif convert_timedelta:
return timedeltas
result = timedeltas
else:
if seen.complex_:
if not seen.int_:
return complexes
result = complexes
elif seen.float_ or seen.nan_:
if not seen.int_:
return floats
result = floats
elif seen.int_:
if seen.uint_:
return uints
result = uints
else:
return ints
result = ints
elif seen.is_bool and not seen.nan_:
return bools.view(np.bool_)
result = bools.view(np.bool_)

if result is uints or result is ints or result is floats or result is complexes:
# cast to the largest itemsize when all values are NumPy scalars
if itemsize_max > 0 and itemsize_max != result.dtype.itemsize:
result = result.astype(result.dtype.kind + str(itemsize_max))
return result
elif result is not None:
return result

return objects

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1891,7 +1891,7 @@ def count(self, level=None):
2
"""
if level is None:
return notna(self._values).sum()
return notna(self._values).sum().astype("int64")
else:
warnings.warn(
"Using the level keyword in DataFrame and Series aggregations is "
Expand Down
64 changes: 64 additions & 0 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from pandas.core.dtypes.common import (
ensure_int32,
is_bool,
is_complex,
is_datetime64_any_dtype,
is_datetime64_dtype,
is_datetime64_ns_dtype,
Expand Down Expand Up @@ -614,6 +615,69 @@ def test_maybe_convert_objects_bool_nan(self):
out = lib.maybe_convert_objects(ind.values, safe=1)
tm.assert_numpy_array_equal(out, exp)

@pytest.mark.parametrize(
    "data0",
    [
        True,
        1,
        1.0,
        1.0 + 1.0j,
        np.int8(1),
        np.int16(1),
        np.int32(1),
        np.int64(1),
        np.float16(1),
        np.float32(1),
        np.float64(1),
        np.complex64(1),
        np.complex128(1),
    ],
)
@pytest.mark.parametrize(
    "data1",
    [
        True,
        1,
        1.0,
        1.0 + 1.0j,
        np.int8(1),
        np.int16(1),
        np.int32(1),
        np.int64(1),
        np.float16(1),
        np.float32(1),
        np.float64(1),
        np.complex64(1),
        np.complex128(1),
    ],
)
def test_maybe_convert_objects_itemsize(self, data0, data1):
    """
    maybe_convert_objects should respect the itemsize of NumPy scalar
    inputs, casting the result to the largest itemsize present rather
    than always promoting to 64-bit.
    """
    # GH 40908
    data = [data0, data1]
    arr = np.array(data, dtype="object")

    # The expected dtype kind follows NumPy's own promotion rules.
    common_kind = np.find_common_type(
        [type(data0), type(data1)], scalar_types=[]
    ).kind
    # "python" marks builtin scalars (bool/int/float/complex), which have
    # no ``dtype`` attribute and hence no fixed itemsize of their own.
    kind0 = "python" if not hasattr(data0, "dtype") else data0.dtype.kind
    kind1 = "python" if not hasattr(data1, "dtype") else data1.dtype.kind
    if kind0 != "python" and kind1 != "python":
        # Both NumPy scalars: result keeps the larger of the two itemsizes.
        kind = common_kind
        itemsize = max(data0.dtype.itemsize, data1.dtype.itemsize)
    elif is_bool(data0) or is_bool(data1):
        # bool mixed with any non-bool cannot be unified -> object dtype;
        # empty itemsize string because "bool"/"object" take no size suffix.
        kind = "bool" if (is_bool(data0) and is_bool(data1)) else "object"
        itemsize = ""
    elif is_complex(data0) or is_complex(data1):
        # A builtin Python scalar is treated as maximum precision.
        kind = common_kind
        itemsize = 16
    else:
        kind = common_kind
        itemsize = 8

    expected = np.array(data, dtype=f"{kind}{itemsize}")
    result = lib.maybe_convert_objects(arr)
    tm.assert_numpy_array_equal(result, expected)

def test_mixed_dtypes_remain_object_array(self):
# GH14956
arr = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], dtype=object)
Expand Down
7 changes: 0 additions & 7 deletions pandas/tests/extension/test_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,6 @@
import numpy as np
import pytest

from pandas.compat import (
IS64,
is_platform_windows,
)
from pandas.errors import PerformanceWarning

from pandas.core.dtypes.common import is_object_dtype
Expand Down Expand Up @@ -428,9 +424,6 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
]:
mark = pytest.mark.xfail(reason="result dtype.fill_value mismatch")
request.node.add_marker(mark)
elif is_platform_windows() or not IS64:
mark = pytest.mark.xfail(reason="results are int32, expected int64")
request.node.add_marker(mark)
super().test_arith_frame_with_scalar(data, all_arithmetic_operators)


Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/constructors/test_from_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def test_from_records_sequencelike(self):
result = DataFrame.from_records(tuples, exclude=exclude)
result.columns = [columns[i] for i in sorted(columns_to_test)]
tm.assert_series_equal(result["C"], df["C"])
tm.assert_series_equal(result["E1"], df["E1"].astype("float64"))
tm.assert_series_equal(result["E1"], df["E1"])

def test_from_records_sequencelike_empty(self):
# empty case
Expand Down
10 changes: 9 additions & 1 deletion pandas/tests/frame/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import numpy as np
import pytest

from pandas.compat import np_version_under1p20

import pandas as pd
from pandas import (
DataFrame,
Expand Down Expand Up @@ -1514,8 +1516,14 @@ def test_replace_commutative(self, df, to_replace, exp):
np.float64(1),
],
)
def test_replace_replacer_dtype(self, replacer):
def test_replace_replacer_dtype(self, request, replacer):
# GH26632
if np.isscalar(replacer) and replacer.dtype.itemsize < 8:
request.node.add_marker(
pytest.mark.xfail(
np_version_under1p20, reason="np.putmask doesn't coerce dtype"
)
)
df = DataFrame(["a"])
result = df.replace({"a": replacer, "b": replacer})
expected = DataFrame([replacer])
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1924,12 +1924,12 @@ def test_constructor_for_list_with_dtypes(self):
# test list of lists/ndarrays
df = DataFrame([np.arange(5) for x in range(5)])
result = df.dtypes
expected = Series([np.dtype("int64")] * 5)
expected = Series([np.dtype("int")] * 5)
tm.assert_series_equal(result, expected)

df = DataFrame([np.array(np.arange(5), dtype="int32") for x in range(5)])
result = df.dtypes
expected = Series([np.dtype("int64")] * 5)
expected = Series([np.dtype("int32")] * 5)
tm.assert_series_equal(result, expected)

# overflow issue? (we always expected int64 upcasting here)
Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,7 @@ def max_value(group):

applied = df.groupby("A").apply(max_value)
result = applied.dtypes
expected = Series(
[np.dtype("object")] * 2 + [np.dtype("float64")] * 2 + [np.dtype("int64")],
index=["A", "B", "C", "D", "value"],
)
expected = df.dtypes
tm.assert_series_equal(result, expected)


Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexing/test_coercion.py
Original file line number Diff line number Diff line change
Expand Up @@ -641,7 +641,7 @@ def test_where_series_complex128(self, fill_val, exp_dtype):
values = klass([True, False, True, True])
else:
values = klass(x * fill_val for x in [5, 6, 7, 8])
exp = klass([1 + 1j, values[1], 3 + 3j, values[3]])
exp = klass([1 + 1j, values[1], 3 + 3j, values[3]], dtype=exp_dtype)
self._assert_where_conversion(obj, cond, values, exp, exp_dtype)

@pytest.mark.parametrize(
Expand Down

0 comments on commit 549e39b

Please sign in to comment.