From e8b37dad224676689a8ae2726974fa9d52703f7b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 20 Sep 2018 06:50:59 -0500 Subject: [PATCH 1/8] ENH: is_homogenous --- pandas/core/base.py | 15 +++++++++++++ pandas/core/frame.py | 28 ++++++++++++++++++++++++ pandas/core/indexes/multi.py | 20 +++++++++++++++++ pandas/tests/frame/test_dtypes.py | 24 ++++++++++++++++++++ pandas/tests/indexing/test_multiindex.py | 8 +++++++ pandas/tests/series/test_dtypes.py | 5 +++++ 6 files changed, 100 insertions(+) diff --git a/pandas/core/base.py b/pandas/core/base.py index d831dc69338bd..26fea89b45ae1 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -663,6 +663,21 @@ def transpose(self, *args, **kwargs): T = property(transpose, doc="return the transpose, which is by " "definition self") + @property + def _is_homogeneous(self): + """Whether the object has a single dtype. + + By definition, Series and Index are always considered homogeneous. + A MultiIndex may or may not be homogeneous, depending on the + dtypes of the levels. + + See Also + -------- + DataFrame._is_homogeneous + MultiIndex._is_homogeneous + """ + return True + @property def shape(self): """ return a tuple of the shape of the underlying data """ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bb221ced9e6bd..8e7b3270bda2f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -613,6 +613,34 @@ def shape(self): """ return len(self.index), len(self.columns) + @property + def _is_homogeneous(self): + """ + Whether all the columns in a DataFrame have the same type. + + Returns + ------- + bool + + Examples + -------- + >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous + True + >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous + False + + Items with the type but different sizes are considered different + types. + + >>> DataFrame({"A": np.array([1, 2], dtype=np.int32), + ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous + False + """ + if self._data.any_extension_types: + return len({block.dtype for block in self._data.blocks}) == 1 + else: + return not self._data.is_mixed_type + def _repr_fits_vertical_(self): """ Check length against max_rows. diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a7932f667f6de..c0d5bf5c7a08e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -288,6 +288,26 @@ def _verify_integrity(self, labels=None, levels=None): def levels(self): return self._levels + @property + def _is_homogeneous(self): + """Whether the levels of a MultiIndex all have the same dtype. + + This looks at the dtypes of the levels. + + See Also + -------- + Index._is_homogeneous + DataFrame._is_homogeneous + + Examples + -------- + >>> MultiIndex.from_tuples([('a', 'b'), ('a', 'c')])._is_homogeneous + True + >>> MultiIndex.from_tuples([('a', 1), ('a', 2)])._is_homogeneous + False + """ + return len(set(x.dtype for x in self.levels)) <= 1 + def _set_levels(self, levels, level=None, copy=False, validate=True, verify_integrity=False): # This is NOT part of the levels property because it should be diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 3b3ab3d03dce9..ca4bd64659e06 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -815,6 +815,30 @@ def test_constructor_list_str_na(self, string_dtype): expected = DataFrame({"A": ['1.0', '2.0', None]}, dtype=object) assert_frame_equal(result, expected) + @pytest.mark.parametrize("data, expected", [ + # empty + (DataFrame(), True), + # multi-same + (DataFrame({"A": [1, 2], "B": [1, 2]}), True), + # multi-object + (DataFrame({"A": np.array([1, 2], dtype=object), + "B": np.array(["a", "b"], dtype=object)}), True), + # multi-extension + (DataFrame({"A": pd.Categorical(['a', 'b']), + "B": pd.Categorical(['a', 'b'])}), True), + # differ types + (DataFrame({"A": [1, 2], "B": [1., 2.]}), False), + # differ sizes + (DataFrame({"A": np.array([1, 2], dtype=np.int32), + "B": np.array([1, 2], dtype=np.int64)}), False), + # multi-extension differ + (DataFrame({"A": pd.Categorical(['a', 'b']), + "B": pd.Categorical(['b', 'c'])}), False), + + ]) + def test_is_homogeneous(self, data, expected): + assert data._is_homogeneous is expected + class TestDataFrameDatetimeWithTZ(TestData): diff --git a/pandas/tests/indexing/test_multiindex.py b/pandas/tests/indexing/test_multiindex.py index 9e66dfad3ddc7..aefa8badf72e7 100644 --- a/pandas/tests/indexing/test_multiindex.py +++ b/pandas/tests/indexing/test_multiindex.py @@ -733,6 +733,14 @@ def test_multiindex_contains_dropped(self): assert 'a' in idx.levels[0] assert 'a' not in idx + @pytest.mark.parametrize("data, expected", [ + (MultiIndex.from_product([(), ()]), True), + (MultiIndex.from_product([(1, 2), (3, 4)]), True), + (MultiIndex.from_product([('a', 'b'), (1, 2)]), False), + ]) + def test_multiindex_is_homogeneous(self, data, expected): + assert data._is_homogeneous is expected + class TestMultiIndexSlicers(object): diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 7aecaf340a3e0..83a458eedbd93 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -508,3 +508,8 @@ def test_infer_objects_series(self): assert actual.dtype == 'object' tm.assert_series_equal(actual, expected) + + def test_is_homogeneous(self): + assert Series()._is_homogeneous + assert Series([1, 2])._is_homogeneous + assert Series(pd.Categorical([1, 2]))._is_homogeneous From 0197e0c562e8d8ee8796cd551cf946448bbd6dfd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 20 Sep 2018 08:31:59 -0500 Subject: [PATCH 2/8] BUG: Preserve dtype on homogeneous EA xs --- doc/source/whatsnew/v0.24.0.txt | 1 + pandas/core/internals/managers.py | 33 +++++++++++++++++++------- pandas/tests/indexing/test_indexing.py | 28 ++++++++++++++++++++++ 3 files changed, 54 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 9e2c20c78f489..c16915f492828 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -545,6 +545,7 @@ Other API Changes - :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`) - :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`) - :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`) +- Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) .. _whatsnew_0240.deprecations: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 63738594799f5..b14ccd61a3d44 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -906,14 +906,25 @@ def fast_xs(self, loc): # unique dtype = _interleaved_dtype(self.blocks) + n = len(items) - result = np.empty(n, dtype=dtype) + if is_extension_array_dtype(dtype): + # we'll eventually construct an ExtensionArray. + result = np.empty(n, dtype=object) + else: + result = np.empty(n, dtype=dtype) + for blk in self.blocks: # Such assignment may incorrectly coerce NaT to None # result[blk.mgr_locs] = blk._slice((slice(None), loc)) for i, rl in enumerate(blk.mgr_locs): result[rl] = blk._try_coerce_result(blk.iget((i, loc))) + if is_extension_array_dtype(dtype): + result = dtype.construct_array_type()._from_sequence( + result, dtype=dtype + ) + return result def consolidate(self): @@ -1855,16 +1866,22 @@ def _shape_compat(x): def _interleaved_dtype(blocks): - if not len(blocks): - return None + # type: (List[Block]) -> Optional[Union[np.dtype, ExtensionDtype]] + """Find the common dtype for `blocks`. - dtype = find_common_type([b.dtype for b in blocks]) + Parameters + ---------- + blocks : List[Block] - # only numpy compat - if isinstance(dtype, (PandasExtensionDtype, ExtensionDtype)): - dtype = np.object + Returns + ------- + dtype : Optional[Union[np.dtype, ExtensionDtype]] + None is returned when `blocks` is empty. + """ + if not len(blocks): + return None - return dtype + return find_common_type([b.dtype for b in blocks]) def _consolidate(blocks): diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 761c633f89da3..0f524ca0aaac5 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -1079,3 +1079,31 @@ def test_validate_indices_high(): def test_validate_indices_empty(): with tm.assert_raises_regex(IndexError, "indices are out"): validate_indices(np.array([0, 1]), 0) + + +def test_extension_array_cross_section(): + # A cross-section of a homogeneous EA should be an EA + df = pd.DataFrame({ + "A": pd.core.arrays.integer_array([1, 2]), + "B": pd.core.arrays.integer_array([3, 4]) + }, index=['a', 'b']) + expected = pd.Series(pd.core.arrays.integer_array([1, 3]), + index=['A', 'B'], name='a') + result = df.loc['a'] + tm.assert_series_equal(result, expected) + + result = df.iloc[0] + tm.assert_series_equal(result, expected) + + +def test_extension_array_cross_section_converts(): + df = pd.DataFrame({ + "A": pd.core.arrays.integer_array([1, 2]), + "B": np.array([1, 2]), + }, index=['a', 'b']) + result = df.loc['a'] + expected = pd.Series([1, 1], dtype=object, index=['A', 'B'], name='a') + tm.assert_series_equal(result, expected) + + result = df.iloc[0] + tm.assert_series_equal(result, expected) From 62326ae00a9ffe1a869e819d9b5ed31cbaa49b26 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 20 Sep 2018 11:04:57 -0500 Subject: [PATCH 3/8] asarray test --- pandas/tests/frame/test_dtypes.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index ca4bd64659e06..d75bc8590e6fa 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -839,6 +839,13 @@ def test_constructor_list_str_na(self, string_dtype): def test_is_homogeneous(self, data, expected): assert data._is_homogeneous is expected + def test_asarray_homogenous(self): + df = pd.DataFrame({"A": pd.Categorical([1, 2]), + "B": pd.Categorical([1, 2])}) + result = np.asarray(df) + expected = np.array([[1, 1], [2, 2,]]) + tm.assert_numpy_array_equal(result, expected) + class TestDataFrameDatetimeWithTZ(TestData): From f008c3874d949563547ddd7c60fa7f1f6bed6ca6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 20 Sep 2018 11:16:19 -0500 Subject: [PATCH 4/8] Fixed asarray --- pandas/core/internals/managers.py | 5 +++++ pandas/tests/frame/test_dtypes.py | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b14ccd61a3d44..b95686c9ca297 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -791,6 +791,11 @@ def _interleave(self): """ dtype = _interleaved_dtype(self.blocks) + if is_extension_array_dtype(dtype): + # TODO: https://github.com/pandas-dev/pandas/issues/22791 + # Give EAs some input on what happens here. Sparse needs this. + dtype = 'object' + result = np.empty(self.shape, dtype=dtype) if result.shape[0] == 0: diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index d75bc8590e6fa..b8acd83bb3fff 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -843,7 +843,8 @@ def test_asarray_homogenous(self): df = pd.DataFrame({"A": pd.Categorical([1, 2]), "B": pd.Categorical([1, 2])}) result = np.asarray(df) - expected = np.array([[1, 1], [2, 2,]]) + # may change from object in the future + expected = np.array([[1, 1], [2, 2,]], dtype='object') tm.assert_numpy_array_equal(result, expected) From 78798cf325cdd4ff0c3910b74a4facfc52720412 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 20 Sep 2018 11:48:40 -0500 Subject: [PATCH 5/8] is_homogeneous -> is_homogeneous_type --- pandas/core/base.py | 6 +++--- pandas/core/frame.py | 11 ++++++----- pandas/core/indexes/multi.py | 12 +++++++----- pandas/tests/frame/test_dtypes.py | 4 ++-- pandas/tests/indexing/test_multiindex.py | 4 ++-- pandas/tests/series/test_dtypes.py | 8 ++++---- 6 files changed, 24 insertions(+), 21 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 26fea89b45ae1..71c3f8de72070 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -664,7 +664,7 @@ def transpose(self, *args, **kwargs): "definition self") @property - def _is_homogeneous(self): + def _is_homogeneous_type(self): """Whether the object has a single dtype. By definition, Series and Index are always considered homogeneous. @@ -673,8 +673,8 @@ def _is_homogeneous(self): See Also -------- - DataFrame._is_homogeneous - MultiIndex._is_homogeneous + DataFrame._is_homogeneous_type + MultiIndex._is_homogeneous_type """ return True diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 959b0a4fd1890..12ff867ca9868 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -614,7 +614,7 @@ def shape(self): return len(self.index), len(self.columns) @property - def _is_homogeneous(self): + def _is_homogeneous_type(self): """ Whether all the columns in a DataFrame have the same type. @@ -624,16 +624,17 @@ def _is_homogeneous(self): Examples -------- - >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous + >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type True - >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous + >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type False Items with the same type but different sizes are considered different types. - >>> DataFrame({"A": np.array([1, 2], dtype=np.int32), - ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous + >>> DataFrame({ + ... "A": np.array([1, 2], dtype=np.int32), + ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type False """ if self._data.any_extension_types: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ad38f037b6578..3e6b934e1e863 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -289,21 +289,23 @@ def levels(self): return self._levels @property - def _is_homogeneous(self): + def _is_homogeneous_type(self): """Whether the levels of a MultiIndex all have the same dtype. This looks at the dtypes of the levels. See Also -------- - Index._is_homogeneous - DataFrame._is_homogeneous + Index._is_homogeneous_type + DataFrame._is_homogeneous_type Examples -------- - >>> MultiIndex.from_tuples([('a', 'b'), ('a', 'c')])._is_homogeneous + >>> MultiIndex.from_tuples([ + ... ('a', 'b'), ('a', 'c')])._is_homogeneous_type True - >>> MultiIndex.from_tuples([('a', 1), ('a', 2)])._is_homogeneous + >>> MultiIndex.from_tuples([ + ... ('a', 1), ('a', 2)])._is_homogeneous_type False """ return len({x.dtype for x in self.levels}) <= 1 diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index b8acd83bb3fff..ff89775ad5c06 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -836,8 +836,8 @@ def test_constructor_list_str_na(self, string_dtype): "B": pd.Categorical(['b', 'c'])}), False), ]) - def test_is_homogeneous(self, data, expected): - assert data._is_homogeneous is expected + def test_is_homogeneous_type(self, data, expected): + assert data._is_homogeneous_type is expected def test_asarray_homogenous(self): df = pd.DataFrame({"A": pd.Categorical([1, 2]), diff --git a/pandas/tests/indexing/test_multiindex.py b/pandas/tests/indexing/test_multiindex.py index aefa8badf72e7..b8f80164e5402 100644 --- a/pandas/tests/indexing/test_multiindex.py +++ b/pandas/tests/indexing/test_multiindex.py @@ -738,8 +738,8 @@ def test_multiindex_contains_dropped(self): (MultiIndex.from_product([(1, 2), (3, 4)]), True), (MultiIndex.from_product([('a', 'b'), (1, 2)]), False), ]) - def test_multiindex_is_homogeneous(self, data, expected): - assert data._is_homogeneous is expected + def test_multiindex_is_homogeneous_type(self, data, expected): + assert data._is_homogeneous_type is expected class TestMultiIndexSlicers(object): diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 83a458eedbd93..125dff9ecfa7c 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -509,7 +509,7 @@ def test_infer_objects_series(self): assert actual.dtype == 'object' tm.assert_series_equal(actual, expected) - def test_is_homogeneous(self): - assert Series()._is_homogeneous - assert Series([1, 2])._is_homogeneous - assert Series(pd.Categorical([1, 2]))._is_homogeneous + def test_is_homogeneous_type(self): + assert Series()._is_homogeneous_type + assert Series([1, 2])._is_homogeneous_type + assert Series(pd.Categorical([1, 2]))._is_homogeneous_type From b0514245d12f63f3f77ad2c88c0025fb64a0f174 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 20 Sep 2018 11:49:41 -0500 Subject: [PATCH 6/8] lint --- pandas/tests/frame/test_dtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index ff89775ad5c06..c91370dc36770 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -844,7 +844,7 @@ def test_asarray_homogenous(self): "B": pd.Categorical([1, 2])}) result = np.asarray(df) # may change from object in the future - expected = np.array([[1, 1], [2, 2,]], dtype='object') + expected = np.array([[1, 1], [2, 2]], dtype='object') tm.assert_numpy_array_equal(result, expected) From d6a2479cf9ee0c860dff515d308d0e7b19e46b44 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 20 Sep 2018 13:36:53 -0500 Subject: [PATCH 7/8] lint --- pandas/core/internals/managers.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b95686c9ca297..2f29f1ae2509f 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -12,9 +12,6 @@ from pandas.util._validators import validate_bool_kwarg from pandas.compat import range, map, zip -from pandas.core.dtypes.dtypes import ( - ExtensionDtype, - PandasExtensionDtype) from pandas.core.dtypes.common import ( _NS_DTYPE, is_datetimelike_v_numeric, From 78dd81e7c74345c1525b2ea7a623415a6c602f87 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 26 Sep 2018 06:39:11 -0500 Subject: [PATCH 8/8] Moved whatsnew to correct section [ci skip] --- doc/source/whatsnew/v0.24.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 0387294702a74..707257a35983e 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -500,6 +500,7 @@ ExtensionType Changes - :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`) - :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`) - :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185:`). +- Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) - Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`) .. _whatsnew_0240.api.incompatibilities: @@ -553,7 +554,6 @@ Other API Changes - :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`) - :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`) - :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`) -- Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) .. _whatsnew_0240.deprecations: