Skip to content

Commit

Permalink
FEAT-#2663: Add algebraic operator from_labels (#2665)
Browse files Browse the repository at this point in the history
Resolves #2663

This operator is necessary for efficient `reset_index` operations. See
this paper for more information on the operator:
http://www.vldb.org/pvldb/vol13/p2033-petersohn.pdf

Co-authored-by: William Ma <12377941+williamma12@users.noreply.github.com>

Signed-off-by: Devin Petersohn <devin.petersohn@gmail.com>
  • Loading branch information
devin-petersohn authored Jan 31, 2021
1 parent f2a7271 commit 03ea9b2
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 10 deletions.
12 changes: 2 additions & 10 deletions modin/backends/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -533,16 +533,8 @@ def reset_index(self, **kwargs):
if level is not None or self.has_multiindex():
return self.default_to_pandas(pandas.DataFrame.reset_index, **kwargs)
if not drop:
new_column_name = (
self.index.name
if self.index.name is not None
else "index"
if "index" not in self.columns
else "level_0"
)
new_self = self.insert(0, new_column_name, self.index)
else:
new_self = self.copy()
return self.__constructor__(self._modin_frame.from_labels())
new_self = self.copy()
new_self.index = pandas.RangeIndex(len(new_self.index))
return new_self

Expand Down
61 changes: 61 additions & 0 deletions modin/engines/base/frame/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -536,6 +536,67 @@ def mask(
row_numeric_idx=new_row_order, col_numeric_idx=new_col_order
)

def from_labels(self) -> "BasePandasFrame":
    """Convert the row labels to a column of data, inserted at the first position.

    Each level of the row index becomes one leading column and the returned
    frame is labelled with a fresh ``pandas.RangeIndex``.

    Column names for the former index levels are chosen as follows:

    * multi-level index: the level name, or ``"level_<i>"`` when the level
      at position ``i`` is unnamed;
    * single-level index: the index name, or ``"index"`` when it is
      unnamed, or ``"level_0"`` when it is unnamed and ``"index"`` is
      already one of the column labels.

    Returns
    -------
    BasePandasFrame
        A new BasePandasFrame.
    """
    index_names = self.index.names
    new_row_labels = pandas.RangeIndex(len(self.index))

    # Column labels are different for multilevel index.
    if len(index_names) > 1:
        level_labels = [
            name if name is not None else "level_{}".format(level)
            for level, name in enumerate(index_names)
        ]
    elif index_names[0] is not None:
        level_labels = [index_names[0]]
    elif "index" not in self.columns:
        level_labels = ["index"]
    else:
        level_labels = ["level_0"]

    # `new_column_names` is used both for the external column labels and,
    # inside the executor below, for the index names of the partition data.
    # Sharing one object is a lightweight way of ensuring the metadata matches.
    new_column_names = pandas.Index(level_labels)
    new_columns = new_column_names.append(self.columns)

    def from_labels_executor(df, **kwargs):
        # Setting the names here ensures that external and internal metadata
        # always match.
        # NOTE(review): this renames the index of the `df` object handed to
        # the executor in place; confirm the partition layer passes a copy.
        df.index.names = new_column_names
        return df.reset_index()

    # NOTE(review): the positional `0` and `[0]` presumably select axis 0 and
    # the first column partition; confirm against the frame manager's
    # `apply_func_to_select_indices` signature.
    new_parts = self._frame_mgr_cls.apply_func_to_select_indices(
        0,
        self._partitions,
        from_labels_executor,
        [0],
        keep_remaining=True,
    )
    # The first width grows by one column per former index level; the
    # remaining widths are carried over unchanged.
    new_column_widths = [
        len(index_names) + self._column_widths[0]
    ] + self._column_widths[1:]
    result = self.__constructor__(
        new_parts,
        new_row_labels,
        new_columns,
        row_lengths=self._row_lengths_cache,
        column_widths=new_column_widths,
    )
    # Propagate the new row labels to all of the dataframe partitions.
    result._apply_index_objs(0)
    return result

def reorder_labels(self, row_numeric_idx=None, col_numeric_idx=None):
"""Reorder the column and or rows in this DataFrame.
Expand Down

0 comments on commit 03ea9b2

Please sign in to comment.