Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT-#1291 #1187: Add DataFrame.unstack, Series.unstack #1649

Merged
merged 1 commit into from
Aug 28, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/supported_apis/dataframe_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ default to pandas.
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
| ``tz_localize`` | `tz_localize`_ | Y | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
| ``unstack`` | `unstack`_ | D | |
| ``unstack`` | `unstack`_ | Y | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
| ``update`` | `update`_ | Y | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
Expand Down
2 changes: 1 addition & 1 deletion docs/supported_apis/series_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,7 @@ the related section on `Defaulting to pandas`_.
+-----------------------------+---------------------------------+
| ``unique`` | Y |
+-----------------------------+---------------------------------+
| ``unstack`` | D |
| ``unstack`` | Y |
+-----------------------------+---------------------------------+
| ``update`` | Y |
+-----------------------------+---------------------------------+
Expand Down
4 changes: 4 additions & 0 deletions modin/backends/base/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1194,6 +1194,10 @@ def groupby_agg(self, by, axis, agg_func, groupby_args, agg_args):

# END Manual Partitioning methods

@abc.abstractmethod
def unstack(self, level, fill_value):
    """Pivot a level of the (necessarily hierarchical) index labels.

    Parameters
    ----------
    level : int, str, or list of these
        Level(s) of the index to unstack; level names may be passed.
    fill_value : scalar
        Value used to replace NaN if the unstack produces missing values.

    Returns
    -------
    BaseQueryCompiler
        New query compiler containing the unstacked data.
    """
    pass

@abc.abstractmethod
def get_dummies(self, columns, **kwargs):
"""Convert categorical variables to dummy variables for certain columns.
Expand Down
87 changes: 80 additions & 7 deletions modin/backends/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -793,7 +793,7 @@ def resample_app_ser(self, resample_args, func, *args, **kwargs):
df_op=lambda df: df.squeeze(axis=1),
func=func,
*args,
**kwargs
**kwargs,
)

def resample_app_df(self, resample_args, func, *args, **kwargs):
Expand All @@ -806,7 +806,7 @@ def resample_agg_ser(self, resample_args, func, *args, **kwargs):
df_op=lambda df: df.squeeze(axis=1),
func=func,
*args,
**kwargs
**kwargs,
)

def resample_agg_df(self, resample_args, func, *args, **kwargs):
Expand Down Expand Up @@ -851,7 +851,7 @@ def resample_interpolate(
limit_direction,
limit_area,
downcast,
**kwargs
**kwargs,
):
return self._resample_func(
resample_args,
Expand All @@ -862,7 +862,7 @@ def resample_interpolate(
limit_direction=limit_direction,
limit_area=limit_area,
downcast=downcast,
**kwargs
**kwargs,
)

def resample_count(self, resample_args):
Expand Down Expand Up @@ -910,7 +910,7 @@ def resample_ohlc_ser(self, resample_args, _method, *args, **kwargs):
df_op=lambda df: df.squeeze(axis=1),
_method=_method,
*args,
**kwargs
**kwargs,
)

def resample_ohlc_df(self, resample_args, _method, *args, **kwargs):
Expand Down Expand Up @@ -1075,6 +1075,79 @@ def rolling_aggregate(self, rolling_args, func, *args, **kwargs):
)
return self.__constructor__(new_modin_frame)

def unstack(self, level, fill_value):
    """Pivot a level of the (necessarily hierarchical) index labels.

    Parameters
    ----------
    level : int, str, or list of these
        Level(s) of the index to unstack; level names may be passed.
    fill_value : scalar
        Value used to replace NaN if the unstack produces missing values.

    Returns
    -------
    PandasQueryCompiler
        New query compiler containing the unstacked data.
    """
    # When the index is flat, or when every level of a MultiIndex is being
    # unstacked, pandas' unstack returns a Series. Model that here as a
    # single "__reduced__" column and apply the map along the column axis;
    # the final index is then rebuilt below (need_reindex).
    if not isinstance(self.index, pandas.MultiIndex) or (
        isinstance(self.index, pandas.MultiIndex)
        and is_list_like(level)
        and len(level) == self.index.nlevels
    ):
        axis = 1
        new_columns = ["__reduced__"]
        need_reindex = True
    else:
        axis = 0
        new_columns = None
        need_reindex = False

    def map_func(df):
        # Wrap in a DataFrame because unstack may return a Series.
        return pandas.DataFrame(df.unstack(level=level, fill_value=fill_value))

    # Special case: MultiIndex on BOTH axes and all index levels are
    # unstacked. Replace the columns with a flat integer range so the
    # per-partition results align, keep a backup of the real columns, and
    # rebuild the full product index after the computation.
    is_all_multi_list = False
    if (
        isinstance(self.index, pandas.MultiIndex)
        and isinstance(self.columns, pandas.MultiIndex)
        and is_list_like(level)
        and len(level) == self.index.nlevels
    ):
        is_all_multi_list = True
        real_cols_bkp = self.columns
        obj = self.copy()
        obj.columns = np.arange(len(obj.columns))
    else:
        obj = self

    new_modin_frame = obj._modin_frame._apply_full_axis(
        axis, map_func, new_columns=new_columns
    )
    result = self.__constructor__(new_modin_frame)

    if is_all_multi_list:
        # Sort first so the row order matches the from_product ordering.
        result = result.sort_index()
        index_level_values = [lvl for lvl in obj.index.levels]
        columns_level_values = [
            real_cols_bkp.get_level_values(lvl).unique()
            for lvl in np.arange(real_cols_bkp.nlevels)
        ]
        # NOTE(review): from_product assumes the result covers the full
        # cartesian product of column and index level values — confirm
        # this holds for sparse MultiIndexes.
        result.index = pandas.MultiIndex.from_product(
            [*columns_level_values, *index_level_values]
        )
        return result

    if need_reindex:
        # Rebuild the Series-shaped result's index as the product of the
        # original columns and index (column levels come first, matching
        # pandas' unstack-to-Series ordering).
        if isinstance(self.index, pandas.MultiIndex):
            index_level_values = [
                self.index.get_level_values(lvl).unique()
                for lvl in np.arange(self.index.nlevels)
            ]
            new_index = pandas.MultiIndex.from_product(
                [self.columns, *index_level_values]
            )
        else:
            if isinstance(self.columns, pandas.MultiIndex):
                columns_level_values = [
                    self.columns.get_level_values(lvl).unique()
                    for lvl in np.arange(self.columns.nlevels)
                ]
                new_index = pandas.MultiIndex.from_product(
                    [*columns_level_values, self.index]
                )
            else:
                new_index = pandas.MultiIndex.from_product(
                    [self.columns, self.index]
                )
        # Reindex along axis 0 to the freshly built product index.
        result = result.reindex(0, new_index)
    return result

# Map partitions operations
# These operations are operations that apply a function to every partition.
abs = MapFunction.register(pandas.DataFrame.abs, dtypes="copy")
Expand Down Expand Up @@ -1632,7 +1705,7 @@ def sort_index(self, **kwargs):
axis=axis,
level=level,
sort_remaining=sort_remaining,
**kwargs
**kwargs,
)

# sort_index can have ascending be None and behaves as if it is False.
Expand Down Expand Up @@ -2147,7 +2220,7 @@ def compute_groupby(df):
try:
agg_func(
pandas.DataFrame(index=[1], columns=[1]).groupby(level=0),
**agg_args
**agg_args,
)
except Exception as e:
raise type(e)("No numeric types to aggregate.")
Expand Down
3 changes: 0 additions & 3 deletions modin/pandas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3333,9 +3333,6 @@ def tz_localize(
)
return self.set_axis(labels=new_labels, axis=axis, inplace=not copy)

def unstack(self, level=-1, fill_value=None):
return self._default_to_pandas("unstack", level=level, fill_value=fill_value)

def var(
self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
):
Expand Down
31 changes: 31 additions & 0 deletions modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1680,6 +1680,37 @@ def slice_shift(self, periods=1, axis=0):
new_df.columns = new_columns
return new_df

def unstack(self, level=-1, fill_value=None):
    """
    Pivot a level of the (necessarily hierarchical) index labels.

    Returns a DataFrame having a new level of column labels whose
    inner-most level consists of the pivoted index labels. If the index
    is not a MultiIndex, the output will be a Series (the analogue of
    stack when the columns are not a MultiIndex). The level involved
    will automatically get sorted.

    Parameters
    ----------
    level : int, str, or list of these, default -1 (last level)
        Level(s) of index to unstack, can pass level name.
    fill_value : int, str or dict
        Replace NaN with this value if the unstack produces missing values.

    Returns
    -------
    Series or DataFrame
    """
    # Unstacking a flat index, or every level of a MultiIndex at once,
    # yields a one-dimensional (Series) result; otherwise a DataFrame.
    index_is_multi = isinstance(self.index, pandas.MultiIndex)
    unstacks_all_levels = (
        index_is_multi
        and is_list_like(level)
        and len(level) == self.index.nlevels
    )
    new_query_compiler = self._query_compiler.unstack(level, fill_value)
    if not index_is_multi or unstacks_all_levels:
        return self._reduce_dimension(query_compiler=new_query_compiler)
    return DataFrame(query_compiler=new_query_compiler)

def pivot(self, index=None, columns=None, values=None):
return self._default_to_pandas(
pandas.DataFrame.pivot, index=index, columns=columns, values=values
Expand Down
23 changes: 23 additions & 0 deletions modin/pandas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1079,6 +1079,29 @@ def slice_shift(self, periods=1, axis=0):
)
)

def unstack(self, level=-1, fill_value=None):
    """
    Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame.

    The level involved will automatically get sorted.

    Parameters
    ----------
    level : int, str, or list of these, default last level
        Level(s) to unstack, can pass level name.
    fill_value : scalar value, default None
        Value to use when replacing NaN values.

    Returns
    -------
    DataFrame
        Unstacked Series.
    """
    from .dataframe import DataFrame

    unstacked = DataFrame(
        query_compiler=self._query_compiler.unstack(level, fill_value)
    )
    # The query compiler may return the frame with an extra synthetic
    # top column level; drop it so the result matches pandas.
    if unstacked.columns.nlevels > 1:
        unstacked = unstacked.droplevel(0, axis=1)
    return unstacked

@property
def plot(
self,
Expand Down
76 changes: 72 additions & 4 deletions modin/pandas/test/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2950,10 +2950,78 @@ def test_tz_localize(self):
pandas_df.tz_localize("America/Los_Angeles", axis=0),
)

def test_unstack(self):
data = test_data_values[0]
with pytest.warns(UserWarning):
pd.DataFrame(data).unstack()
@pytest.mark.parametrize(
    "is_multi_idx", [True, False], ids=["idx_multi", "idx_index"]
)
@pytest.mark.parametrize(
    "is_multi_col", [True, False], ids=["col_multi", "col_index"]
)
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_unstack(self, data, is_multi_idx, is_multi_col):
    # Compare DataFrame.unstack against pandas for every combination of
    # flat/MultiIndex rows and columns over the shared test fixtures.
    pandas_df = pandas.DataFrame(data)
    modin_df = pd.DataFrame(data)

    if is_multi_idx:
        # The 256 and 100 branches presumably match the row counts of
        # the test_data fixtures; each product below multiplies out to
        # exactly that length — TODO confirm against test fixtures.
        if len(pandas_df.index) == 256:
            index = pd.MultiIndex.from_product(
                [
                    ["a", "b", "c", "d"],
                    ["x", "y", "z", "last"],
                    ["i", "j", "k", "index"],
                    [1, 2, 3, 4],
                ]
            )
        elif len(pandas_df.index) == 100:
            index = pd.MultiIndex.from_product(
                [
                    ["x", "y", "z", "last"],
                    ["a", "b", "c", "d", "f"],
                    ["i", "j", "k", "l", "index"],
                ]
            )
    else:
        # Keep the fixture's original flat index.
        index = pandas_df.index

    if is_multi_col:
        # Same pattern for the column axis: 64 = 4*4*4, 100 = 4*5*5.
        if len(pandas_df.columns) == 64:
            columns = pd.MultiIndex.from_product(
                [
                    ["A", "B", "C", "D"],
                    ["xx", "yy", "zz", "LAST"],
                    [10, 20, 30, 40],
                ]
            )
        elif len(pandas_df.columns) == 100:
            columns = pd.MultiIndex.from_product(
                [
                    ["xx", "yy", "zz", "LAST"],
                    ["A", "B", "C", "D", "F"],
                    ["I", "J", "K", "L", "INDEX"],
                ]
            )
    else:
        columns = pandas_df.columns

    # Apply the same axes to both frames so results are comparable.
    pandas_df.columns = columns
    pandas_df.index = index

    modin_df.columns = columns
    modin_df.index = index

    df_equals(modin_df.unstack(), pandas_df.unstack())

    if is_multi_idx:
        # Exercise single-level, partial multi-level, and (for the
        # 4-level fixture) full multi-level unstacks.
        df_equals(modin_df.unstack(level=1), pandas_df.unstack(level=1))
        df_equals(modin_df.unstack(level=[0, 1]), pandas_df.unstack(level=[0, 1]))
        df_equals(
            modin_df.unstack(level=[0, 1, 2]), pandas_df.unstack(level=[0, 1, 2])
        )

        if len(pandas_df.index) == 256:
            df_equals(
                modin_df.unstack(level=[0, 1, 2, 3]),
                pandas_df.unstack(level=[0, 1, 2, 3]),
            )

@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___array__(self, data):
Expand Down
46 changes: 32 additions & 14 deletions modin/pandas/test/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3073,21 +3073,39 @@ def test_unique(data):
assert_array_equal(modin_result, pandas_result)


def test_unstack():
s = pd.Series(
np.random.randint(1, 100, 12),
index=pd.MultiIndex.from_tuples(
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_unstack(data):
modin_series, pandas_series = create_test_series(data)
if len(pandas_series.index) == 256:
index = pd.MultiIndex.from_product(
[
(num, letter, color)
for num in range(1, 3)
for letter in ["a", "b", "c"]
for color in ["Red", "Green"]
],
names=["Number", "Letter", "Color"],
),
)
with pytest.warns(UserWarning):
s.unstack()
["a", "b", "c", "d"],
["x", "y", "z", "last"],
["i", "j", "k", "index"],
[1, 2, 3, 4],
]
)
elif len(pandas_series.index) == 100:
index = pd.MultiIndex.from_product(
[
["x", "y", "z", "last"],
["a", "b", "c", "d", "f"],
["i", "j", "k", "l", "index"],
]
)

modin_series = pd.Series(data[next(iter(data.keys()))], index=index)
pandas_series = pandas.Series(data[next(iter(data.keys()))], index=index)

df_equals(modin_series.unstack(), pandas_series.unstack())
df_equals(modin_series.unstack(level=0), pandas_series.unstack(level=0))
df_equals(modin_series.unstack(level=[0, 1]), pandas_series.unstack(level=[0, 1]))

if len(pandas_series.index) == 256:
df_equals(
modin_series.unstack(level=[0, 1, 2]),
pandas_series.unstack(level=[0, 1, 2]),
)


@pytest.mark.parametrize(
Expand Down