Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Make maybe_convert_object respect dtype itemsize #40908

Merged
merged 14 commits into from
Apr 21, 2021
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,7 @@ Other enhancements
- :meth:`pandas.read_csv` and :meth:`pandas.read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`)
- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` use Kleene logic with nullable data types (:issue:`37506`)
- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`)
- Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype whose precision is the maximum of the precisions of those NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`)
- Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`)
-

Expand Down Expand Up @@ -689,7 +690,7 @@ Numeric
- Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`)
- Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`)
- Bug in :meth:`DataFrameGroupBy.rank` giving incorrect results with ``pct=True`` and equal values between consecutive groups (:issue:`40518`)
-
- Bug in :meth:`Series.count` would result in an ``int32`` result on 32-bit platforms when argument ``level=None`` (:issue:`40908`)

Conversion
^^^^^^^^^^
Expand Down
88 changes: 61 additions & 27 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ cdef extern from "numpy/arrayobject.h":
object fields
tuple names

cdef extern from "numpy/ndarrayobject.h":
bint PyArray_CheckScalar(obj) nogil


cdef extern from "src/parse_helper.h":
int floatify(object, float64_t *result, int *maybe_int) except -1
Expand Down Expand Up @@ -209,6 +212,24 @@ def is_scalar(val: object) -> bool:
or is_offset_object(val))


cdef inline int64_t get_itemsize(object val):
    """
    Get the itemsize of a NumPy scalar, -1 if not a NumPy scalar.

    Parameters
    ----------
    val : object
        Candidate value; any object may be passed.

    Returns
    -------
    itemsize : int64_t
        The itemsize (in bytes) of ``val``'s dtype if ``val`` is a NumPy
        scalar, -1 otherwise.
    """
    if PyArray_CheckScalar(val):
        # PyArray_DescrFromScalar gives the dtype descriptor of the scalar,
        # from which we read the byte width.
        return cnp.PyArray_DescrFromScalar(val).itemsize
    else:
        return -1


def is_iterator(obj: object) -> bool:
"""
Check if the object is an iterator.
Expand Down Expand Up @@ -2188,7 +2209,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,

Parameters
----------
values : ndarray[object]
objects : ndarray[object]
Array of object elements to convert.
try_float : bool, default False
If an array-like object contains only float or NaN values is
Expand All @@ -2212,7 +2233,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
Array of converted object values to more specific dtypes if applicable.
"""
cdef:
Py_ssize_t i, n
Py_ssize_t i, n, itemsize_max = 0
ndarray[float64_t] floats
ndarray[complex128_t] complexes
ndarray[int64_t] ints
Expand Down Expand Up @@ -2245,6 +2266,10 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,

for i in range(n):
val = objects[i]
if itemsize_max != -1:
itemsize = get_itemsize(val)
if itemsize > itemsize_max or itemsize == -1:
itemsize_max = itemsize

if val is None:
seen.null_ = True
Expand Down Expand Up @@ -2346,92 +2371,101 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False,
seen.object_ = True

if not seen.object_:
result = None
if not safe:
if seen.null_ or seen.nan_:
if seen.is_float_or_complex:
if seen.complex_:
return complexes
result = complexes
elif seen.float_:
return floats
result = floats
elif seen.int_:
if convert_to_nullable_integer:
from pandas.core.arrays import IntegerArray
return IntegerArray(ints, mask)
result = IntegerArray(ints, mask)
else:
return floats
result = floats
elif seen.nan_:
return floats
result = floats
else:
if not seen.bool_:
if seen.datetime_:
if not seen.numeric_ and not seen.timedelta_:
return datetimes
result = datetimes
elif seen.timedelta_:
if not seen.numeric_:
return timedeltas
result = timedeltas
elif seen.nat_:
if not seen.numeric_:
if convert_datetime and convert_timedelta:
# TODO: array full of NaT ambiguity resolve here needed
pass
elif convert_datetime:
return datetimes
result = datetimes
elif convert_timedelta:
return timedeltas
result = timedeltas
else:
if seen.complex_:
return complexes
result = complexes
elif seen.float_:
return floats
result = floats
elif seen.int_:
if seen.uint_:
return uints
result = uints
else:
return ints
result = ints
elif seen.is_bool:
return bools.view(np.bool_)
result = bools.view(np.bool_)

else:
# don't cast int to float, etc.
if seen.null_:
if seen.is_float_or_complex:
if seen.complex_:
if not seen.int_:
return complexes
result = complexes
elif seen.float_ or seen.nan_:
if not seen.int_:
return floats
result = floats
else:
if not seen.bool_:
if seen.datetime_:
if not seen.numeric_ and not seen.timedelta_:
return datetimes
result = datetimes
elif seen.timedelta_:
if not seen.numeric_:
return timedeltas
result = timedeltas
elif seen.nat_:
if not seen.numeric_:
if convert_datetime and convert_timedelta:
# TODO: array full of NaT ambiguity resolve here needed
pass
elif convert_datetime:
return datetimes
result = datetimes
elif convert_timedelta:
return timedeltas
result = timedeltas
else:
if seen.complex_:
if not seen.int_:
return complexes
result = complexes
elif seen.float_ or seen.nan_:
if not seen.int_:
return floats
result = floats
elif seen.int_:
if seen.uint_:
return uints
result = uints
else:
return ints
result = ints
elif seen.is_bool and not seen.nan_:
return bools.view(np.bool_)
result = bools.view(np.bool_)

if result is uints or result is ints or result is floats or result is complexes:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you put a blank line & comment here (e.g. casting to itemsize)

# cast to the largest itemsize when all values are NumPy scalars
if itemsize_max > 0 and itemsize_max != result.dtype.itemsize:
result = result.astype(result.dtype.kind + str(itemsize_max))
return result
elif result is not None:
return result

return objects

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1891,7 +1891,7 @@ def count(self, level=None):
2
"""
if level is None:
return notna(self._values).sum()
return notna(self._values).sum().astype("int64")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All paths for Series.count and DataFrame.count result in int64 except for this one.

else:
warnings.warn(
"Using the level keyword in DataFrame and Series aggregations is "
Expand Down
64 changes: 64 additions & 0 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from pandas.core.dtypes.common import (
ensure_int32,
is_bool,
is_complex,
is_datetime64_any_dtype,
is_datetime64_dtype,
is_datetime64_ns_dtype,
Expand Down Expand Up @@ -614,6 +615,69 @@ def test_maybe_convert_objects_bool_nan(self):
out = lib.maybe_convert_objects(ind.values, safe=1)
tm.assert_numpy_array_equal(out, exp)

@pytest.mark.parametrize(
    "data0",
    [
        True,
        1,
        1.0,
        1.0 + 1.0j,
        np.int8(1),
        np.int16(1),
        np.int32(1),
        np.int64(1),
        np.float16(1),
        np.float32(1),
        np.float64(1),
        np.complex64(1),
        np.complex128(1),
    ],
)
@pytest.mark.parametrize(
    "data1",
    [
        True,
        1,
        1.0,
        1.0 + 1.0j,
        np.int8(1),
        np.int16(1),
        np.int32(1),
        np.int64(1),
        np.float16(1),
        np.float32(1),
        np.float64(1),
        np.complex64(1),
        np.complex128(1),
    ],
)
def test_maybe_convert_objects_itemsize(self, data0, data1):
    # GH 40908
    # maybe_convert_objects should honor the largest itemsize among the
    # NumPy scalars in the input rather than always upcasting to 64 bits.
    data = [data0, data1]
    arr = np.array(data, dtype="object")

    common_kind = np.find_common_type(
        [type(data0), type(data1)], scalar_types=[]
    ).kind

    dtype0 = getattr(data0, "dtype", None)
    dtype1 = getattr(data1, "dtype", None)

    if dtype0 is not None and dtype1 is not None:
        # Both values are NumPy scalars: expect the common kind at the
        # larger of the two itemsizes.
        kind = common_kind
        itemsize = max(dtype0.itemsize, dtype1.itemsize)
    elif is_bool(data0) or is_bool(data1):
        # Mixing bool with anything non-bool stays object.
        kind = "bool" if is_bool(data0) and is_bool(data1) else "object"
        itemsize = ""
    elif is_complex(data0) or is_complex(data1):
        # A Python complex forces the widest complex dtype.
        kind = common_kind
        itemsize = 16
    else:
        # Python int/float default to 64-bit precision.
        kind = common_kind
        itemsize = 8

    expected = np.array(data, dtype=f"{kind}{itemsize}")
    result = lib.maybe_convert_objects(arr)
    tm.assert_numpy_array_equal(result, expected)

def test_mixed_dtypes_remain_object_array(self):
# GH14956
arr = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], dtype=object)
Expand Down
7 changes: 0 additions & 7 deletions pandas/tests/extension/test_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,6 @@
import numpy as np
import pytest

from pandas.compat import (
IS64,
is_platform_windows,
)
from pandas.errors import PerformanceWarning

from pandas.core.dtypes.common import is_object_dtype
Expand Down Expand Up @@ -428,9 +424,6 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
]:
mark = pytest.mark.xfail(reason="result dtype.fill_value mismatch")
request.node.add_marker(mark)
elif is_platform_windows() or not IS64:
mark = pytest.mark.xfail(reason="results are int32, expected int64")
request.node.add_marker(mark)
super().test_arith_frame_with_scalar(data, all_arithmetic_operators)


Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/constructors/test_from_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def test_from_records_sequencelike(self):
result = DataFrame.from_records(tuples, exclude=exclude)
result.columns = [columns[i] for i in sorted(columns_to_test)]
tm.assert_series_equal(result["C"], df["C"])
tm.assert_series_equal(result["E1"], df["E1"].astype("float64"))
tm.assert_series_equal(result["E1"], df["E1"])

def test_from_records_sequencelike_empty(self):
# empty case
Expand Down
10 changes: 9 additions & 1 deletion pandas/tests/frame/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import numpy as np
import pytest

from pandas.compat import np_version_under1p20

import pandas as pd
from pandas import (
DataFrame,
Expand Down Expand Up @@ -1514,8 +1516,14 @@ def test_replace_commutative(self, df, to_replace, exp):
np.float64(1),
],
)
def test_replace_replacer_dtype(self, replacer):
def test_replace_replacer_dtype(self, request, replacer):
# GH26632
if np.isscalar(replacer) and replacer.dtype.itemsize < 8:
request.node.add_marker(
pytest.mark.xfail(
np_version_under1p20, reason="np.putmask doesn't coerce dtype"
)
)
df = DataFrame(["a"])
result = df.replace({"a": replacer, "b": replacer})
expected = DataFrame([replacer])
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1924,12 +1924,12 @@ def test_constructor_for_list_with_dtypes(self):
# test list of lists/ndarrays
df = DataFrame([np.arange(5) for x in range(5)])
result = df.dtypes
expected = Series([np.dtype("int64")] * 5)
expected = Series([np.dtype("int")] * 5)
tm.assert_series_equal(result, expected)

df = DataFrame([np.array(np.arange(5), dtype="int32") for x in range(5)])
result = df.dtypes
expected = Series([np.dtype("int64")] * 5)
expected = Series([np.dtype("int32")] * 5)
tm.assert_series_equal(result, expected)

# overflow issue? (we always expected int64 upcasting here)
Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,7 @@ def max_value(group):

applied = df.groupby("A").apply(max_value)
result = applied.dtypes
expected = Series(
[np.dtype("object")] * 2 + [np.dtype("float64")] * 2 + [np.dtype("int64")],
index=["A", "B", "C", "D", "value"],
)
expected = df.dtypes
tm.assert_series_equal(result, expected)


Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/indexing/test_coercion.py
Original file line number Diff line number Diff line change
Expand Up @@ -641,7 +641,7 @@ def test_where_series_complex128(self, fill_val, exp_dtype):
values = klass([True, False, True, True])
else:
values = klass(x * fill_val for x in [5, 6, 7, 8])
exp = klass([1 + 1j, values[1], 3 + 3j, values[3]])
exp = klass([1 + 1j, values[1], 3 + 3j, values[3]], dtype=exp_dtype)
self._assert_where_conversion(obj, cond, values, exp, exp_dtype)

@pytest.mark.parametrize(
Expand Down