Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preserve Extension type on cross section #22785

Merged
merged 10 commits into from
Sep 26, 2018
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,7 @@ ExtensionType Changes
- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`)
- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`)
- :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185`).
- Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
- Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`)

.. _whatsnew_0240.api.incompatibilities:
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -664,7 +664,7 @@ def transpose(self, *args, **kwargs):
"definition self")

@property
def _is_homogeneous(self):
def _is_homogeneous_type(self):
"""Whether the object has a single dtype.

By definition, Series and Index are always considered homogeneous.
Expand All @@ -673,8 +673,8 @@ def _is_homogeneous(self):

See Also
--------
DataFrame._is_homogeneous
MultiIndex._is_homogeneous
DataFrame._is_homogeneous_type
MultiIndex._is_homogeneous_type
"""
return True

Expand Down
11 changes: 6 additions & 5 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -614,7 +614,7 @@ def shape(self):
return len(self.index), len(self.columns)

@property
def _is_homogeneous(self):
def _is_homogeneous_type(self):
"""
Whether all the columns in a DataFrame have the same type.

Expand All @@ -624,16 +624,17 @@ def _is_homogeneous(self):

Examples
--------
>>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous
>>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
True
>>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous
>>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
False

Items with the same type but different sizes are considered
different types.

>>> DataFrame({"A": np.array([1, 2], dtype=np.int32),
... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous
>>> DataFrame({
... "A": np.array([1, 2], dtype=np.int32),
... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
False
"""
if self._data.any_extension_types:
Expand Down
12 changes: 7 additions & 5 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,21 +289,23 @@ def levels(self):
return self._levels

@property
def _is_homogeneous(self):
def _is_homogeneous_type(self):
"""Whether the levels of a MultiIndex all have the same dtype.

This looks at the dtypes of the levels.

See Also
--------
Index._is_homogeneous
DataFrame._is_homogeneous
Index._is_homogeneous_type
DataFrame._is_homogeneous_type

Examples
--------
>>> MultiIndex.from_tuples([('a', 'b'), ('a', 'c')])._is_homogeneous
>>> MultiIndex.from_tuples([
... ('a', 'b'), ('a', 'c')])._is_homogeneous_type
True
>>> MultiIndex.from_tuples([('a', 1), ('a', 2)])._is_homogeneous
>>> MultiIndex.from_tuples([
... ('a', 1), ('a', 2)])._is_homogeneous_type
False
"""
return len({x.dtype for x in self.levels}) <= 1
Expand Down
41 changes: 30 additions & 11 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,6 @@
from pandas.util._validators import validate_bool_kwarg
from pandas.compat import range, map, zip

from pandas.core.dtypes.dtypes import (
ExtensionDtype,
PandasExtensionDtype)
from pandas.core.dtypes.common import (
_NS_DTYPE,
is_datetimelike_v_numeric,
Expand Down Expand Up @@ -791,6 +788,11 @@ def _interleave(self):
"""
dtype = _interleaved_dtype(self.blocks)

if is_extension_array_dtype(dtype):
# TODO: https://github.com/pandas-dev/pandas/issues/22791
# Give EAs some input on what happens here. Sparse needs this.
dtype = 'object'

result = np.empty(self.shape, dtype=dtype)

if result.shape[0] == 0:
Expand Down Expand Up @@ -906,14 +908,25 @@ def fast_xs(self, loc):

# unique
dtype = _interleaved_dtype(self.blocks)

n = len(items)
result = np.empty(n, dtype=dtype)
if is_extension_array_dtype(dtype):
# we'll eventually construct an ExtensionArray.
result = np.empty(n, dtype=object)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do people find this confusing? I can either

  1. duplicate the for loop, using list.append for EAs and inserting into result for other dtypes
  2. use lists everywhere
  3. use this

I chose this implementation because I assume it's slightly faster for wide dataframes with a numpy type, compared to building a list and then calling np.asarray(result) at the end.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This implementation looks good to me

else:
result = np.empty(n, dtype=dtype)

for blk in self.blocks:
# Such assignment may incorrectly coerce NaT to None
# result[blk.mgr_locs] = blk._slice((slice(None), loc))
for i, rl in enumerate(blk.mgr_locs):
result[rl] = blk._try_coerce_result(blk.iget((i, loc)))

if is_extension_array_dtype(dtype):
result = dtype.construct_array_type()._from_sequence(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this guaranteed to be 1d at this point?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

result is created a few lines above with np.empty(n, dtype=object), so I assume yes

result, dtype=dtype
)

return result

def consolidate(self):
Expand Down Expand Up @@ -1855,16 +1868,22 @@ def _shape_compat(x):


def _interleaved_dtype(blocks):
if not len(blocks):
return None
# type: (List[Block]) -> Optional[Union[np.dtype, ExtensionDtype]]
"""Find the common dtype for `blocks`.

dtype = find_common_type([b.dtype for b in blocks])
Parameters
----------
blocks : List[Block]

# only numpy compat
if isinstance(dtype, (PandasExtensionDtype, ExtensionDtype)):
dtype = np.object
Returns
-------
dtype : Optional[Union[np.dtype, ExtensionDtype]]
None is returned when `blocks` is empty.
"""
if not len(blocks):
return None

return dtype
return find_common_type([b.dtype for b in blocks])


def _consolidate(blocks):
Expand Down
12 changes: 10 additions & 2 deletions pandas/tests/frame/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -836,8 +836,16 @@ def test_constructor_list_str_na(self, string_dtype):
"B": pd.Categorical(['b', 'c'])}), False),

])
def test_is_homogeneous(self, data, expected):
assert data._is_homogeneous is expected
def test_is_homogeneous_type(self, data, expected):
assert data._is_homogeneous_type is expected

def test_asarray_homogenous(self):
df = pd.DataFrame({"A": pd.Categorical([1, 2]),
"B": pd.Categorical([1, 2])})
result = np.asarray(df)
# may change from object in the future
expected = np.array([[1, 1], [2, 2]], dtype='object')
tm.assert_numpy_array_equal(result, expected)


class TestDataFrameDatetimeWithTZ(TestData):
Expand Down
28 changes: 28 additions & 0 deletions pandas/tests/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1079,3 +1079,31 @@ def test_validate_indices_high():
def test_validate_indices_empty():
with tm.assert_raises_regex(IndexError, "indices are out"):
validate_indices(np.array([0, 1]), 0)


def test_extension_array_cross_section():
# A cross-section of a homogeneous EA should be an EA
df = pd.DataFrame({
"A": pd.core.arrays.integer_array([1, 2]),
"B": pd.core.arrays.integer_array([3, 4])
}, index=['a', 'b'])
expected = pd.Series(pd.core.arrays.integer_array([1, 3]),
index=['A', 'B'], name='a')
result = df.loc['a']
tm.assert_series_equal(result, expected)

result = df.iloc[0]
tm.assert_series_equal(result, expected)


def test_extension_array_cross_section_converts():
df = pd.DataFrame({
"A": pd.core.arrays.integer_array([1, 2]),
"B": np.array([1, 2]),
}, index=['a', 'b'])
result = df.loc['a']
expected = pd.Series([1, 1], dtype=object, index=['A', 'B'], name='a')
tm.assert_series_equal(result, expected)

result = df.iloc[0]
tm.assert_series_equal(result, expected)
4 changes: 2 additions & 2 deletions pandas/tests/indexing/test_multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -738,8 +738,8 @@ def test_multiindex_contains_dropped(self):
(MultiIndex.from_product([(1, 2), (3, 4)]), True),
(MultiIndex.from_product([('a', 'b'), (1, 2)]), False),
])
def test_multiindex_is_homogeneous(self, data, expected):
assert data._is_homogeneous is expected
def test_multiindex_is_homogeneous_type(self, data, expected):
assert data._is_homogeneous_type is expected


class TestMultiIndexSlicers(object):
Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/series/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,7 +509,7 @@ def test_infer_objects_series(self):
assert actual.dtype == 'object'
tm.assert_series_equal(actual, expected)

def test_is_homogeneous(self):
assert Series()._is_homogeneous
assert Series([1, 2])._is_homogeneous
assert Series(pd.Categorical([1, 2]))._is_homogeneous
def test_is_homogeneous_type(self):
assert Series()._is_homogeneous_type
assert Series([1, 2])._is_homogeneous_type
assert Series(pd.Categorical([1, 2]))._is_homogeneous_type