Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

COMPAT: unique() should preserve the dtype of the input #27874

Merged
merged 2 commits into from
Oct 7, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ Other API changes

- :meth:`pandas.api.types.infer_dtype` will now return "integer-na" for integer and ``np.nan`` mix (:issue:`27283`)
- :meth:`MultiIndex.from_arrays` will no longer infer names from arrays if ``names=None`` is explicitly provided (:issue:`27292`)
- The returned dtype of :func:`pd.unique` now matches the input dtype. (:issue:`27874`)
-

.. _whatsnew_1000.api.documentation:
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,13 +180,13 @@ def _reconstruct_data(values, dtype, original):
if is_extension_array_dtype(dtype):
values = dtype.construct_array_type()._from_sequence(values)
elif is_bool_dtype(dtype):
values = values.astype(dtype)
values = values.astype(dtype, copy=False)

# a boolean Index is only supported with object dtype
if isinstance(original, ABCIndexClass):
values = values.astype(object)
values = values.astype(object, copy=False)
elif dtype is not None:
values = values.astype(dtype)
values = values.astype(dtype, copy=False)

return values

Expand Down Expand Up @@ -396,7 +396,7 @@ def unique(values):

table = htable(len(values))
uniques = table.unique(values)
uniques = _reconstruct_data(uniques, dtype, original)
uniques = _reconstruct_data(uniques, original.dtype, original)
stuarteberg marked this conversation as resolved.
Show resolved Hide resolved
return uniques


Expand Down
27 changes: 23 additions & 4 deletions pandas/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,8 +159,8 @@ def test_memory_usage(self):
class Ops:
def _allow_na_ops(self, obj):
"""Whether to skip test cases including NaN"""
if isinstance(obj, Index) and (obj.is_boolean() or not obj._can_hold_na):
# don't test boolean / int64 index
if (isinstance(obj, Index) and obj.is_boolean()) or not obj._can_hold_na:
# don't test boolean / integer dtypes
return False
return True

Expand All @@ -187,7 +187,24 @@ def setup_method(self, method):
types = ["bool", "int", "float", "dt", "dt_tz", "period", "string", "unicode"]
self.indexes = [getattr(self, "{}_index".format(t)) for t in types]
self.series = [getattr(self, "{}_series".format(t)) for t in types]
self.objs = self.indexes + self.series

# To test narrow dtypes, we use narrower *data* elements, not *index* elements
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This whole thing badly needs parametrization.

Can you pull out the unique tests and do that instead of repeating all of this?

index = self.int_index
self.float32_series = Series(arr.astype(np.float32), index=index, name="a")

arr_int = np.random.choice(10, size=10, replace=False)
self.int8_series = Series(arr_int.astype(np.int8), index=index, name="a")
self.int16_series = Series(arr_int.astype(np.int16), index=index, name="a")
self.int32_series = Series(arr_int.astype(np.int32), index=index, name="a")

self.uint8_series = Series(arr_int.astype(np.uint8), index=index, name="a")
self.uint16_series = Series(arr_int.astype(np.uint16), index=index, name="a")
self.uint32_series = Series(arr_int.astype(np.uint32), index=index, name="a")

nrw_types = ["float32", "int8", "int16", "int32", "uint8", "uint16", "uint32"]
self.narrow_series = [getattr(self, "{}_series".format(t)) for t in nrw_types]

self.objs = self.indexes + self.series + self.narrow_series

def check_ops_properties(self, props, filter=None, ignore_failures=False):
for op in props:
Expand Down Expand Up @@ -385,6 +402,7 @@ def test_value_counts_unique_nunique(self):
if isinstance(o, Index):
assert isinstance(result, o.__class__)
tm.assert_index_equal(result, orig)
assert result.dtype == orig.dtype
elif is_datetime64tz_dtype(o):
# datetimetz Series returns array of Timestamp
assert result[0] == orig[0]
Expand All @@ -396,6 +414,7 @@ def test_value_counts_unique_nunique(self):
)
else:
tm.assert_numpy_array_equal(result, orig.values)
assert result.dtype == orig.dtype

assert o.nunique() == len(np.unique(o.values))

Expand Down Expand Up @@ -904,7 +923,7 @@ def test_fillna(self):

expected = [fill_value] * 2 + list(values[2:])

expected = klass(expected)
expected = klass(expected, dtype=orig.dtype)
o = klass(values)

# check values has the same dtype as the original
Expand Down