From 549e39bf7a4f925424b39e613895594f48dbb1a5 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Wed, 21 Apr 2021 08:53:27 -0400 Subject: [PATCH] ENH: Make maybe_convert_object respect dtype itemsize (#40908) --- doc/source/whatsnew/v1.3.0.rst | 3 +- pandas/_libs/lib.pyx | 88 +++++++++++++------ pandas/core/series.py | 2 +- pandas/tests/dtypes/test_inference.py | 64 ++++++++++++++ pandas/tests/extension/test_sparse.py | 7 -- .../frame/constructors/test_from_records.py | 2 +- pandas/tests/frame/methods/test_replace.py | 10 ++- pandas/tests/frame/test_constructors.py | 4 +- pandas/tests/groupby/test_groupby.py | 5 +- pandas/tests/indexing/test_coercion.py | 2 +- 10 files changed, 142 insertions(+), 45 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index dd6f722c801aa..a5a23a2908246 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -221,6 +221,7 @@ Other enhancements - :meth:`pandas.read_csv` and :meth:`pandas.read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`) - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` use Kleene logic with nullable data types (:issue:`37506`) - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`) +- Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`) - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`) - @@ -691,7 +692,7 @@ Numeric - Bug in :meth:`DataFrame.apply` and :meth:`DataFrame.agg` when passed argument ``func="size"`` would operate on the entire ``DataFrame`` instead of rows or columns (:issue:`39934`) - Bug in :meth:`DataFrame.transform` would raise ``SpecificationError`` when passed a dictionary and columns were missing; will now raise a ``KeyError`` instead (:issue:`40004`) - Bug in :meth:`DataFrameGroupBy.rank` giving incorrect results with ``pct=True`` and equal values between consecutive groups (:issue:`40518`) -- +- Bug in :meth:`Series.count` would result in an ``int32`` result on 32-bit platforms when argument ``level=None`` (:issue:`40908`) Conversion ^^^^^^^^^^ diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index a5ed650d72911..77375cac39921 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -68,6 +68,9 @@ cdef extern from "numpy/arrayobject.h": object fields tuple names +cdef extern from "numpy/ndarrayobject.h": + bint PyArray_CheckScalar(obj) nogil + cdef extern from "src/parse_helper.h": int floatify(object, float64_t *result, int *maybe_int) except -1 @@ -209,6 +212,24 @@ def is_scalar(val: object) -> bool: or is_offset_object(val)) +cdef inline int64_t get_itemsize(object val): + """ + Get the itemsize of a NumPy scalar, -1 if not a NumPy scalar. + + Parameters + ---------- + val : object + + Returns + ------- + is_ndarray : bool + """ + if PyArray_CheckScalar(val): + return cnp.PyArray_DescrFromScalar(val).itemsize + else: + return -1 + + def is_iterator(obj: object) -> bool: """ Check if the object is an iterator. @@ -2188,7 +2209,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, Parameters ---------- - values : ndarray[object] + objects : ndarray[object] Array of object elements to convert. try_float : bool, default False If an array-like object contains only float or NaN values is @@ -2212,7 +2233,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, Array of converted object values to more specific dtypes if applicable. """ cdef: - Py_ssize_t i, n + Py_ssize_t i, n, itemsize_max = 0 ndarray[float64_t] floats ndarray[complex128_t] complexes ndarray[int64_t] ints @@ -2245,6 +2266,10 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, for i in range(n): val = objects[i] + if itemsize_max != -1: + itemsize = get_itemsize(val) + if itemsize > itemsize_max or itemsize == -1: + itemsize_max = itemsize if val is None: seen.null_ = True @@ -2346,50 +2371,51 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, seen.object_ = True if not seen.object_: + result = None if not safe: if seen.null_ or seen.nan_: if seen.is_float_or_complex: if seen.complex_: - return complexes + result = complexes elif seen.float_: - return floats + result = floats elif seen.int_: if convert_to_nullable_integer: from pandas.core.arrays import IntegerArray - return IntegerArray(ints, mask) + result = IntegerArray(ints, mask) else: - return floats + result = floats elif seen.nan_: - return floats + result = floats else: if not seen.bool_: if seen.datetime_: if not seen.numeric_ and not seen.timedelta_: - return datetimes + result = datetimes elif seen.timedelta_: if not seen.numeric_: - return timedeltas + result = timedeltas elif seen.nat_: if not seen.numeric_: if convert_datetime and convert_timedelta: # TODO: array full of NaT ambiguity resolve here needed pass elif convert_datetime: - return datetimes + result = datetimes elif convert_timedelta: - return timedeltas + result = timedeltas else: if seen.complex_: - return complexes + result = complexes elif seen.float_: - return floats + result = floats elif seen.int_: if seen.uint_: - return uints + result = uints else: - return ints + result = ints elif seen.is_bool: - return bools.view(np.bool_) + result = bools.view(np.bool_) else: # don't cast int to float, etc. @@ -2397,41 +2423,49 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=False, if seen.is_float_or_complex: if seen.complex_: if not seen.int_: - return complexes + result = complexes elif seen.float_ or seen.nan_: if not seen.int_: - return floats + result = floats else: if not seen.bool_: if seen.datetime_: if not seen.numeric_ and not seen.timedelta_: - return datetimes + result = datetimes elif seen.timedelta_: if not seen.numeric_: - return timedeltas + result = timedeltas elif seen.nat_: if not seen.numeric_: if convert_datetime and convert_timedelta: # TODO: array full of NaT ambiguity resolve here needed pass elif convert_datetime: - return datetimes + result = datetimes elif convert_timedelta: - return timedeltas + result = timedeltas else: if seen.complex_: if not seen.int_: - return complexes + result = complexes elif seen.float_ or seen.nan_: if not seen.int_: - return floats + result = floats elif seen.int_: if seen.uint_: - return uints + result = uints else: - return ints + result = ints elif seen.is_bool and not seen.nan_: - return bools.view(np.bool_) + result = bools.view(np.bool_) + + if result is uints or result is ints or result is floats or result is complexes: + # cast to the largest itemsize when all values are NumPy scalars + if itemsize_max > 0 and itemsize_max != result.dtype.itemsize: + result = result.astype(result.dtype.kind + str(itemsize_max)) + return result + elif result is not None: + return result return objects diff --git a/pandas/core/series.py b/pandas/core/series.py index 440bc4c89e647..e19d521bda3df 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1891,7 +1891,7 @@ def count(self, level=None): 2 """ if level is None: - return notna(self._values).sum() + return notna(self._values).sum().astype("int64") else: warnings.warn( "Using the level keyword in DataFrame and Series aggregations is " diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 907991b97ead1..d1e6409307915 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -31,6 +31,7 @@ from pandas.core.dtypes.common import ( ensure_int32, is_bool, + is_complex, is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64_ns_dtype, @@ -614,6 +615,69 @@ def test_maybe_convert_objects_bool_nan(self): out = lib.maybe_convert_objects(ind.values, safe=1) tm.assert_numpy_array_equal(out, exp) + @pytest.mark.parametrize( + "data0", + [ + True, + 1, + 1.0, + 1.0 + 1.0j, + np.int8(1), + np.int16(1), + np.int32(1), + np.int64(1), + np.float16(1), + np.float32(1), + np.float64(1), + np.complex64(1), + np.complex128(1), + ], + ) + @pytest.mark.parametrize( + "data1", + [ + True, + 1, + 1.0, + 1.0 + 1.0j, + np.int8(1), + np.int16(1), + np.int32(1), + np.int64(1), + np.float16(1), + np.float32(1), + np.float64(1), + np.complex64(1), + np.complex128(1), + ], + ) + def test_maybe_convert_objects_itemsize(self, data0, data1): + # GH 40908 + data = [data0, data1] + arr = np.array(data, dtype="object") + + common_kind = np.find_common_type( + [type(data0), type(data1)], scalar_types=[] + ).kind + kind0 = "python" if not hasattr(data0, "dtype") else data0.dtype.kind + kind1 = "python" if not hasattr(data1, "dtype") else data1.dtype.kind + if kind0 != "python" and kind1 != "python": + kind = common_kind + itemsize = max(data0.dtype.itemsize, data1.dtype.itemsize) + elif is_bool(data0) or is_bool(data1): + kind = "bool" if (is_bool(data0) and is_bool(data1)) else "object" + itemsize = "" + elif is_complex(data0) or is_complex(data1): + kind = common_kind + itemsize = 16 + else: + kind = common_kind + itemsize = 8 + + expected = np.array(data, dtype=f"{kind}{itemsize}") + result = lib.maybe_convert_objects(arr) + tm.assert_numpy_array_equal(result, expected) + def test_mixed_dtypes_remain_object_array(self): # GH14956 arr = np.array([datetime(2015, 1, 1, tzinfo=pytz.utc), 1], dtype=object) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 759277a47f62b..f0d3fb7ff9e1b 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -16,10 +16,6 @@ import numpy as np import pytest -from pandas.compat import ( - IS64, - is_platform_windows, -) from pandas.errors import PerformanceWarning from pandas.core.dtypes.common import is_object_dtype @@ -428,9 +424,6 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): ]: mark = pytest.mark.xfail(reason="result dtype.fill_value mismatch") request.node.add_marker(mark) - elif is_platform_windows() or not IS64: - mark = pytest.mark.xfail(reason="results are int32, expected int64") - request.node.add_marker(mark) super().test_arith_frame_with_scalar(data, all_arithmetic_operators) diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index e8d0a789e7cbd..35ad9f3e9693b 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -117,7 +117,7 @@ def test_from_records_sequencelike(self): result = DataFrame.from_records(tuples, exclude=exclude) result.columns = [columns[i] for i in sorted(columns_to_test)] tm.assert_series_equal(result["C"], df["C"]) - tm.assert_series_equal(result["E1"], df["E1"].astype("float64")) + tm.assert_series_equal(result["E1"], df["E1"]) def test_from_records_sequencelike_empty(self): # empty case diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index d8f93f047e74b..e6ed60dc2bb08 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -10,6 +10,8 @@ import numpy as np import pytest +from pandas.compat import np_version_under1p20 + import pandas as pd from pandas import ( DataFrame, @@ -1514,8 +1516,14 @@ def test_replace_commutative(self, df, to_replace, exp): np.float64(1), ], ) - def test_replace_replacer_dtype(self, replacer): + def test_replace_replacer_dtype(self, request, replacer): # GH26632 + if np.isscalar(replacer) and replacer.dtype.itemsize < 8: + request.node.add_marker( + pytest.mark.xfail( + np_version_under1p20, reason="np.putmask doesn't coerce dtype" + ) + ) df = DataFrame(["a"]) result = df.replace({"a": replacer, "b": replacer}) expected = DataFrame([replacer]) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index ca68885fdc470..c565567754da0 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1924,12 +1924,12 @@ def test_constructor_for_list_with_dtypes(self): # test list of lists/ndarrays df = DataFrame([np.arange(5) for x in range(5)]) result = df.dtypes - expected = Series([np.dtype("int64")] * 5) + expected = Series([np.dtype("int")] * 5) tm.assert_series_equal(result, expected) df = DataFrame([np.array(np.arange(5), dtype="int32") for x in range(5)]) result = df.dtypes - expected = Series([np.dtype("int64")] * 5) + expected = Series([np.dtype("int32")] * 5) tm.assert_series_equal(result, expected) # overflow issue? (we always expected int64 upcasting here) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c5620d6d8c06c..3f6485be871f1 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -99,10 +99,7 @@ def max_value(group): applied = df.groupby("A").apply(max_value) result = applied.dtypes - expected = Series( - [np.dtype("object")] * 2 + [np.dtype("float64")] * 2 + [np.dtype("int64")], - index=["A", "B", "C", "D", "value"], - ) + expected = df.dtypes tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 7642f78076dcb..2bb9b51df2285 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -641,7 +641,7 @@ def test_where_series_complex128(self, fill_val, exp_dtype): values = klass([True, False, True, True]) else: values = klass(x * fill_val for x in [5, 6, 7, 8]) - exp = klass([1 + 1j, values[1], 3 + 3j, values[3]]) + exp = klass([1 + 1j, values[1], 3 + 3j, values[3]], dtype=exp_dtype) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) @pytest.mark.parametrize(