Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG/ENH: groupby quantile arraylike fails with integer columns #30462

Closed
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,8 @@ Other enhancements
- The ``partition_cols`` argument in :meth:`DataFrame.to_parquet` now accepts a string (:issue:`27117`)
- :func:`to_parquet` now appropriately handles the ``schema`` argument for user-defined schemas in the pyarrow engine (:issue:`30270`)
- The DataFrame constructor now preserves the `ExtensionArray` dtype when constructed from an `ExtensionArray` (:issue:`11363`)
- Added an optional ``positional`` parameter to :meth:`DataFrame.reorder_levels` to control whether integer values in ``order`` are interpreted as level labels, as positions, or as labels with a fallback to positions (:issue:`30289`)
- Added an optional ``positional`` parameter to :meth:`MultiIndex.reorder_levels` to control whether integer values in ``order`` are interpreted as level labels, as positions, or as labels with a fallback to positions (:issue:`30289`)


Build Changes
Expand Down Expand Up @@ -845,6 +847,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrameGroupBy.agg` with timezone-aware datetime64 column incorrectly casting results to the original dtype (:issue:`29641`)
- Bug in :meth:`DataFrame.groupby` when using axis=1 and having a single level columns index (:issue:`30208`)
- Bug in :meth:`DataFrame.groupby` when using nunique on axis=1 (:issue:`30253`)
- Bug in :meth:`GroupBy.quantile` with multiple q values when columns are integers (:issue:`30289`)

Reshaping
^^^^^^^^^
Expand Down
14 changes: 11 additions & 3 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5040,7 +5040,7 @@ def swaplevel(self, i=-2, j=-1, axis=0):
result.columns = result.columns.swaplevel(i, j)
return result

def reorder_levels(self, order, axis=0):
def reorder_levels(self, order, axis=0, positional=None):
"""
Rearrange index levels using input order. May not drop or duplicate levels.

Expand All @@ -5051,6 +5051,14 @@ def reorder_levels(self, order, axis=0):
(position) or by key (label).
axis : int
Where to reorder levels.
positional : bool, optional
How to interpret integer values in `order`.

* None (default): prefer treating the values as labels,
but fall back to positional if no label with that
value exists.
* True : only treat integer values as positions.
* False : only treat integer values as labels.

Returns
-------
Expand All @@ -5063,9 +5071,9 @@ def reorder_levels(self, order, axis=0):
result = self.copy()

if axis == 0:
result.index = result.index.reorder_levels(order)
result.index = result.index.reorder_levels(order, positional=positional)
else:
result.columns = result.columns.reorder_levels(order)
result.columns = result.columns.reorder_levels(order, positional=positional)
return result

# ----------------------------------------------------------------------
Expand Down
4 changes: 3 additions & 1 deletion pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1938,7 +1938,9 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray:
# but this hits https://github.com/pandas-dev/pandas/issues/10710
# which doesn't reorder the list-like `q` on the inner level.
order = np.roll(list(range(result.index.nlevels)), -1)
result = result.reorder_levels(order)
result = result.reorder_levels(
order, positional=True
) # GH30289: reorder based on position, not labels
result = result.reindex(q, level=-1)

# fix order.
Expand Down
77 changes: 58 additions & 19 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1282,31 +1282,53 @@ def _constructor(self):
def inferred_type(self) -> str:
return "mixed"

def _get_level_number_by_label(self, label) -> int:
try:
level = self.names.index(label)
except ValueError:
raise KeyError(f"Level {label} not found")
return level

def _get_level_number_by_position(self, pos) -> int:
"""
Returns level number at given position

Parameters
----------
pos : level position given in python list index style which may be negative

Raises
------
IndexError if pos is out of range
"""
if pos < 0:
pos += self.nlevels
if pos < 0:
orig_pos = pos - self.nlevels
raise IndexError(
f"Too many levels: Index has only {self.nlevels} levels,"
f" {orig_pos} is not a valid level number"
)
# Note: levels are zero-based
elif pos >= self.nlevels:
raise IndexError(
f"Too many levels: Index has only {self.nlevels} levels, "
f"not {pos + 1}"
)
return pos

def _get_level_number(self, level) -> int:
count = self.names.count(level)
if (count > 1) and not is_integer(level):
raise ValueError(
f"The name {level} occurs multiple times, use a level number"
)
try:
level = self.names.index(level)
except ValueError:
level = self._get_level_number_by_label(level)
except KeyError:
if not is_integer(level):
raise KeyError(f"Level {level} not found")
elif level < 0:
level += self.nlevels
if level < 0:
orig_level = level - self.nlevels
raise IndexError(
f"Too many levels: Index has only {self.nlevels} levels,"
f" {orig_level} is not a valid level number"
)
# Note: levels are zero-based
elif level >= self.nlevels:
raise IndexError(
f"Too many levels: Index has only {self.nlevels} levels, "
f"not {level + 1}"
)
raise
level = self._get_level_number_by_position(level)
return level

_tuples = None
Expand Down Expand Up @@ -2171,18 +2193,35 @@ def swaplevel(self, i=-2, j=-1):
levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
)

def reorder_levels(self, order):
def reorder_levels(self, order, positional=None):
"""
Rearrange levels using input order. May not drop or duplicate levels.

Parameters
----------
order : list
the order of index levels after reorder, could be level labels or positions
positional : bool, optional
How to interpret integer values in `order`.

* None (default): prefer treating the values as labels,
but fall back to positional if no label with that
value exists.
* True : only treat integer values as positions.
* False : only treat integer values as labels.

Returns
-------
MultiIndex
"""
order = [self._get_level_number(i) for i in order]
# GH30289
if positional is None:
order = [self._get_level_number(i) for i in order]
elif positional:
order = [self._get_level_number_by_position(i) for i in order]
else:
order = [self._get_level_number_by_label(i) for i in order]

if len(order) != self.nlevels:
raise AssertionError(
f"Length of order must be same as number of levels ({self.nlevels}),"
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -1398,6 +1398,22 @@ def test_quantile_array_multiple_levels():
tm.assert_frame_equal(result, expected)


def test_groupby_quantile_with_arraylike_q_and_int_columns():
    # GH30289: list-like ``q`` combined with integer column labels.
    # Both columns hold the same value in every row (0, 1, 2, 3, 0, 1, ...),
    # so inside each group of column 0 every value of column 1 equals the
    # group key and any quantile of the group is that key.
    df = pd.DataFrame(
        np.array([[i % 4, i % 4] for i in range(10)]), columns=[0, 1]
    )
    quantiles = [0.5, 0.6]

    # Outer level: group keys (named after the integer column 0);
    # inner level: the requested quantiles (unnamed).
    expected_index = pd.MultiIndex.from_product(
        [[0, 1, 2, 3], quantiles], names=[0, None]
    )
    expected_values = [0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0]
    expected = pd.DataFrame(expected_values, index=expected_index, columns=[1])

    result = df.groupby(0).quantile(quantiles)

    tm.assert_frame_equal(result, expected)


def test_quantile_raises():
df = pd.DataFrame(
[["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]
Expand Down