BUG: preserve EA dtype in transpose #30091

Merged
merged 20 commits into from Dec 27, 2019
Changes from 3 commits
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
@@ -775,6 +775,7 @@ Reshaping
- Bug where :meth:`DataFrame.equals` returned True incorrectly in some cases when two DataFrames had the same columns in different orders (:issue:`28839`)
- Bug in :meth:`DataFrame.replace` that caused a non-numeric replacer's dtype to not be respected (:issue:`26632`)
- Bug in :func:`melt` where supplying mixed strings and numeric values for ``id_vars`` or ``value_vars`` would incorrectly raise a ``ValueError`` (:issue:`29718`)
- Dtypes are now preserved when transposing a ``DataFrame`` where each column is the same extension dtype (:issue:`30091`)
- Bug in :func:`merge_asof` merging on a tz-aware ``left_index`` and ``right_on`` a tz-aware column (:issue:`29864`)
-

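For illustration only (not part of this diff), a minimal sketch of the behavior the whatsnew entry above describes, assuming a frame whose columns all share the nullable Int64 extension dtype:

import pandas as pd

# Both columns share the same extension dtype (nullable Int64).
df = pd.DataFrame(
    {
        "A": pd.array([1, 2, None], dtype="Int64"),
        "B": pd.array([3, 4, 5], dtype="Int64"),
    }
)

print(df.dtypes)    # Int64 for both A and B
# With this fix, the transposed frame keeps Int64 columns instead of
# falling back to object dtype.
print(df.T.dtypes)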
33 changes: 29 additions & 4 deletions pandas/core/generic.py
@@ -721,12 +721,37 @@ def transpose(self, *args, **kwargs):
new_axes = self._construct_axes_dict_from(
self, [self._get_axis(x) for x in axes_names]
)
new_values = self.values.transpose(axes_numbers)
if kwargs.pop("copy", None) or (len(args) and args[-1]):
new_values = new_values.copy()

if (
self._is_homogeneous_type
and len(self._data.blocks)
and is_extension_array_dtype(self._data.blocks[0].dtype)
Member: we can avoid self._data references by making this len(self.dtypes) and is_extension_array_dtype(self.dtypes.iloc[0])

Member: ditto on 731 with self.dtypes

):
kwargs.pop("copy", None) # by definition, we're copying
dtype = self._data.blocks[0].dtype
arr_type = dtype.construct_array_type()

Contributor: I would move this logic to pandas/core/reshape/reshape.py; this has a lot of similarity to _unstack_extension_series.

# Slow, but unavoidable with 1D EAs.
new_values = []
for i in range(len(self)):
Contributor (Author): I'm rethinking this approach. This results in n_rows * n_columns __getitem__ calls. My intent was to avoid going through a 2D object-dtype ndarray, but we're essentially doing that with lists. So I think it'll be better to just do .values.T and then rebuild the EAs from the object-dtype array.

new_values.append(
arr_type._from_sequence(
[block.values[i] for block in self._data.blocks], dtype=dtype
)
)
columns = new_axes.pop("columns")
new_values = dict(zip(columns, new_values))
result = self._constructor(new_values, **new_axes)

else:
new_values = self.values.transpose(axes_numbers)
if kwargs.pop("copy", None) or (len(args) and args[-1]):
new_values = new_values.copy()

result = self._constructor(new_values, **new_axes)

nv.validate_transpose(tuple(), kwargs)
return self._constructor(new_values, **new_axes).__finalize__(self)
return result.__finalize__(self)

def swapaxes(self, axis1, axis2, copy=True):
"""
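For reference, a rough sketch (not the code merged in this PR) of the alternative the author describes above: go through the 2D object-dtype array once and rebuild each column's extension array from it, reading the dtype from self.dtypes as the reviewer suggests rather than touching self._data. The helper name transpose_homogeneous_ea is made up for illustration:

import pandas as pd

def transpose_homogeneous_ea(df: pd.DataFrame) -> pd.DataFrame:
    """Transpose a DataFrame whose columns all share one extension dtype."""
    dtype = df.dtypes.iloc[0]
    arr_type = dtype.construct_array_type()
    # One pass through the interleaved 2D ndarray (object dtype for
    # extension columns) instead of n_rows * n_columns scalar __getitem__ calls.
    values = df.values  # shape (n_rows, n_columns)
    # Each original row becomes one column of the transposed frame.
    new_columns = {
        label: arr_type._from_sequence(values[i], dtype=dtype)
        for i, label in enumerate(df.index)
    }
    return pd.DataFrame(new_columns, index=df.columns)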
19 changes: 0 additions & 19 deletions pandas/tests/arithmetic/conftest.py
@@ -235,25 +235,6 @@ def box_df_fail(request):
return request.param


@pytest.fixture(
params=[
(pd.Index, False),
(pd.Series, False),
(pd.DataFrame, False),
pytest.param((pd.DataFrame, True), marks=pytest.mark.xfail),
(tm.to_array, False),
],
ids=id_func,
)
def box_transpose_fail(request):
"""
Fixture similar to `box` but testing both transpose cases for DataFrame,
with the transpose=True case xfailed.
"""
# GH#23620
return request.param


@pytest.fixture(params=[pd.Index, pd.Series, pd.DataFrame, tm.to_array], ids=id_func)
def box_with_array(request):
"""
12 changes: 5 additions & 7 deletions pandas/tests/arithmetic/test_period.py
@@ -755,10 +755,10 @@ def test_pi_sub_isub_offset(self):
rng -= pd.offsets.MonthEnd(5)
tm.assert_index_equal(rng, expected)

def test_pi_add_offset_n_gt1(self, box_transpose_fail):
@pytest.mark.parametrize("transpose", [True, False])
def test_pi_add_offset_n_gt1(self, box, transpose):
# GH#23215
# add offset to PeriodIndex with freq.n > 1
box, transpose = box_transpose_fail

per = pd.Period("2016-01", freq="2M")
pi = pd.PeriodIndex([per])
@@ -984,10 +984,9 @@ def test_pi_add_sub_timedeltalike_freq_mismatch_monthly(self, mismatched_freq):
with pytest.raises(IncompatibleFrequency, match=msg):
rng -= other

def test_parr_add_sub_td64_nat(self, box_transpose_fail):
@pytest.mark.parametrize("transpose", [True, False])
def test_parr_add_sub_td64_nat(self, box, transpose):
# GH#23320 special handling for timedelta64("NaT")
box, transpose = box_transpose_fail

pi = pd.period_range("1994-04-01", periods=9, freq="19D")
other = np.timedelta64("NaT")
expected = pd.PeriodIndex(["NaT"] * 9, freq="19D")
@@ -1011,10 +1010,9 @@ def test_parr_add_sub_td64_nat(self, box_transpose_fail):
TimedeltaArray._from_sequence(["NaT"] * 9),
],
)
def test_parr_add_sub_tdt64_nat_array(self, box_df_fail, other):
def test_parr_add_sub_tdt64_nat_array(self, box, other):
# FIXME: DataFrame fails because when operating column-wise
# timedelta64 entries become NaT and are treated like datetimes
box = box_df_fail

pi = pd.period_range("1994-04-01", periods=9, freq="19D")
expected = pd.PeriodIndex(["NaT"] * 9, freq="19D")
14 changes: 14 additions & 0 deletions pandas/tests/extension/base/reshaping.py
@@ -295,3 +295,17 @@ def test_ravel(self, data):
# Check that we have a view, not a copy
result[0] = result[1]
assert data[0] == data[1]

def test_transpose(self, data):
df = pd.DataFrame({"A": data[:4], "B": data[:4]}, index=["a", "b", "c", "d"])
result = df.T
expected = pd.DataFrame(
{
"a": type(data)._from_sequence([data[0]] * 2, dtype=data.dtype),
"b": type(data)._from_sequence([data[1]] * 2, dtype=data.dtype),
"c": type(data)._from_sequence([data[2]] * 2, dtype=data.dtype),
"d": type(data)._from_sequence([data[3]] * 2, dtype=data.dtype),
},
index=["A", "B"],
)
self.assert_frame_equal(result, expected)
4 changes: 4 additions & 0 deletions pandas/tests/extension/test_numpy.py
@@ -330,6 +330,10 @@ def test_merge_on_extension_array_duplicates(self, data):
# Fails creating expected
super().test_merge_on_extension_array_duplicates(data)

@skip_nested
def test_transpose(self, data):
super().test_transpose(data)


class TestSetitem(BaseNumPyTests, base.BaseSetitemTests):
@skip_nested