Skip to content

Commit

Permalink
FEAT-modin-project#1291 modin-project#1187: Add DataFrame.unstack, …
Browse files Browse the repository at this point in the history
…`Series.unstack` (modin-project#1649)

Signed-off-by: Alexey Prutskov <alexey.prutskov@intel.com>
  • Loading branch information
prutskov authored and aregm committed Sep 16, 2020
1 parent 70c1278 commit 605deb4
Show file tree
Hide file tree
Showing 9 changed files with 244 additions and 30 deletions.
2 changes: 1 addition & 1 deletion docs/supported_apis/dataframe_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ default to pandas.
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
| ``tz_localize`` | `tz_localize`_ | Y | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
| ``unstack`` | `unstack`_ | D | |
| ``unstack`` | `unstack`_ | Y | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
| ``update`` | `update`_ | Y | |
+----------------------------+---------------------------+------------------------+----------------------------------------------------+
Expand Down
2 changes: 1 addition & 1 deletion docs/supported_apis/series_supported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,7 @@ the related section on `Defaulting to pandas`_.
+-----------------------------+---------------------------------+
| ``unique`` | Y |
+-----------------------------+---------------------------------+
| ``unstack`` | D |
| ``unstack`` | Y |
+-----------------------------+---------------------------------+
| ``update`` | Y |
+-----------------------------+---------------------------------+
Expand Down
4 changes: 4 additions & 0 deletions modin/backends/base/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1194,6 +1194,10 @@ def groupby_agg(self, by, axis, agg_func, groupby_args, agg_args):

# END Manual Partitioning methods

@abc.abstractmethod
def unstack(self, level, fill_value):
    """Unstack (pivot) index level(s) into columns.

    Abstract hook: concrete query compilers must override it.

    Parameters
    ----------
    level : int, str, or list of these
        Index level(s) to unstack.
    fill_value
        Replacement for missing values produced by the unstack.

    Returns
    -------
    Defined by the implementing backend.
    """
    pass

@abc.abstractmethod
def get_dummies(self, columns, **kwargs):
"""Convert categorical variables to dummy variables for certain columns.
Expand Down
87 changes: 80 additions & 7 deletions modin/backends/pandas/query_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -793,7 +793,7 @@ def resample_app_ser(self, resample_args, func, *args, **kwargs):
df_op=lambda df: df.squeeze(axis=1),
func=func,
*args,
**kwargs
**kwargs,
)

def resample_app_df(self, resample_args, func, *args, **kwargs):
Expand All @@ -806,7 +806,7 @@ def resample_agg_ser(self, resample_args, func, *args, **kwargs):
df_op=lambda df: df.squeeze(axis=1),
func=func,
*args,
**kwargs
**kwargs,
)

def resample_agg_df(self, resample_args, func, *args, **kwargs):
Expand Down Expand Up @@ -851,7 +851,7 @@ def resample_interpolate(
limit_direction,
limit_area,
downcast,
**kwargs
**kwargs,
):
return self._resample_func(
resample_args,
Expand All @@ -862,7 +862,7 @@ def resample_interpolate(
limit_direction=limit_direction,
limit_area=limit_area,
downcast=downcast,
**kwargs
**kwargs,
)

def resample_count(self, resample_args):
Expand Down Expand Up @@ -910,7 +910,7 @@ def resample_ohlc_ser(self, resample_args, _method, *args, **kwargs):
df_op=lambda df: df.squeeze(axis=1),
_method=_method,
*args,
**kwargs
**kwargs,
)

def resample_ohlc_df(self, resample_args, _method, *args, **kwargs):
Expand Down Expand Up @@ -1075,6 +1075,79 @@ def rolling_aggregate(self, rolling_args, func, *args, **kwargs):
)
return self.__constructor__(new_modin_frame)

def unstack(self, level, fill_value):
    """Unstack (pivot) index level(s) of the frame into columns.

    Each full-axis partition is transformed with
    ``df.unstack(level=level, fill_value=fill_value)``.  When the index is
    not a ``MultiIndex``, or ``level`` is list-like and names as many levels
    as the index has, the data collapses into a single ``"__reduced__"``
    column and its index is rebuilt here from the column and index labels.

    Parameters
    ----------
    level : int, str, or list of these
        Index level(s) to unstack.
    fill_value
        Value passed to pandas to replace missing values.

    Returns
    -------
    A new query compiler built with ``self.__constructor__``.
    """
    index = self.index
    columns = self.columns
    index_is_multi = isinstance(index, pandas.MultiIndex)
    columns_are_multi = isinstance(columns, pandas.MultiIndex)
    # True when every level of a MultiIndex is requested at once.
    unstacks_all_levels = (
        index_is_multi and is_list_like(level) and len(level) == index.nlevels
    )
    # In these cases pandas returns a Series, stored as one reduced column.
    reduces_to_column = not index_is_multi or unstacks_all_levels

    def unique_level_values(multi_index):
        # Distinct labels of each level, in order of first appearance.
        return [
            multi_index.get_level_values(num).unique()
            for num in range(multi_index.nlevels)
        ]

    def map_func(df):
        return pandas.DataFrame(df.unstack(level=level, fill_value=fill_value))

    if reduces_to_column:
        axis, new_columns = 1, ["__reduced__"]
    else:
        axis, new_columns = 0, None

    all_multi = unstacks_all_levels and columns_are_multi
    if all_multi:
        # Work on a copy with positional column labels; the real MultiIndex
        # columns are restored through the rebuilt index below.
        obj = self.copy()
        obj.columns = np.arange(len(obj.columns))
    else:
        obj = self

    new_modin_frame = obj._modin_frame._apply_full_axis(
        axis, map_func, new_columns=new_columns
    )
    result = self.__constructor__(new_modin_frame)

    # NOTE(review): every `from_product` below presumes the labels form a
    # complete cartesian product in this exact order -- TODO confirm the
    # behaviour for indexes/columns that are not full products.
    if all_multi:
        result = result.sort_index()
        result.index = pandas.MultiIndex.from_product(
            [*unique_level_values(columns), *index.levels]
        )
        return result

    if reduces_to_column:
        column_parts = unique_level_values(columns) if columns_are_multi else [columns]
        index_parts = unique_level_values(index) if index_is_multi else [index]
        new_index = pandas.MultiIndex.from_product([*column_parts, *index_parts])
        result = result.reindex(0, new_index)
    return result

# Map partitions operations
# These operations are operations that apply a function to every partition.
abs = MapFunction.register(pandas.DataFrame.abs, dtypes="copy")
Expand Down Expand Up @@ -1632,7 +1705,7 @@ def sort_index(self, **kwargs):
axis=axis,
level=level,
sort_remaining=sort_remaining,
**kwargs
**kwargs,
)

# sort_index can have ascending be None and behaves as if it is False.
Expand Down Expand Up @@ -2147,7 +2220,7 @@ def compute_groupby(df):
try:
agg_func(
pandas.DataFrame(index=[1], columns=[1]).groupby(level=0),
**agg_args
**agg_args,
)
except Exception as e:
raise type(e)("No numeric types to aggregate.")
Expand Down
3 changes: 0 additions & 3 deletions modin/pandas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3333,9 +3333,6 @@ def tz_localize(
)
return self.set_axis(labels=new_labels, axis=axis, inplace=not copy)

def unstack(self, level=-1, fill_value=None):
    """Unstack by delegating to ``_default_to_pandas``.

    ``level`` and ``fill_value`` are forwarded unchanged as keyword
    arguments to the ``"unstack"`` operation.
    """
    return self._default_to_pandas("unstack", level=level, fill_value=fill_value)

def var(
self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
):
Expand Down
31 changes: 31 additions & 0 deletions modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1680,6 +1680,37 @@ def slice_shift(self, periods=1, axis=0):
new_df.columns = new_columns
return new_df

def unstack(self, level=-1, fill_value=None):
    """
    Pivot a level of the (necessarily hierarchical) index labels.

    Returns a DataFrame having a new level of column labels whose inner-most
    level consists of the pivoted index labels.  If the index is not a
    MultiIndex, or `level` is list-like and names as many levels as the
    index has, the result is reduced by one dimension instead.

    Parameters
    ----------
    level : int, str, or list of these, default -1 (last level)
        Level(s) of index to unstack, can pass level name.
    fill_value : int, str or dict
        Replace NaN with this value if the unstack produces missing values.

    Returns
    -------
    Series or DataFrame
    """
    labels = self.index
    # Decide the output dimensionality before any work is dispatched.
    collapses = not isinstance(labels, pandas.MultiIndex) or (
        is_list_like(level) and len(level) == labels.nlevels
    )
    unstacked = self._query_compiler.unstack(level, fill_value)
    if collapses:
        return self._reduce_dimension(query_compiler=unstacked)
    return DataFrame(query_compiler=unstacked)

def pivot(self, index=None, columns=None, values=None):
return self._default_to_pandas(
pandas.DataFrame.pivot, index=index, columns=columns, values=values
Expand Down
23 changes: 23 additions & 0 deletions modin/pandas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1079,6 +1079,29 @@ def slice_shift(self, periods=1, axis=0):
)
)

def unstack(self, level=-1, fill_value=None):
    """
    Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame.

    The level involved will automatically get sorted.

    Parameters
    ----------
    level : int, str, or list of these, default last level
        Level(s) to unstack, can pass level name.
    fill_value : scalar value, default None
        Value to use when replacing NaN values.

    Returns
    -------
    DataFrame
        Unstacked Series.
    """
    from .dataframe import DataFrame

    unstacked_qc = self._query_compiler.unstack(level, fill_value)
    frame = DataFrame(query_compiler=unstacked_qc)
    if frame.columns.nlevels > 1:
        # Presumably the outermost level is the placeholder label of the
        # Series' single backing column -- verify against the query compiler.
        return frame.droplevel(0, axis=1)
    return frame

@property
def plot(
self,
Expand Down
76 changes: 72 additions & 4 deletions modin/pandas/test/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2950,10 +2950,78 @@ def test_tz_localize(self):
pandas_df.tz_localize("America/Los_Angeles", axis=0),
)

def test_unstack(self):
    """Check that ``DataFrame.unstack`` emits a ``UserWarning``."""
    data = test_data_values[0]
    with pytest.warns(UserWarning):
        pd.DataFrame(data).unstack()
@pytest.mark.parametrize(
    "is_multi_idx", [True, False], ids=["idx_multi", "idx_index"]
)
@pytest.mark.parametrize(
    "is_multi_col", [True, False], ids=["col_multi", "col_index"]
)
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_unstack(self, data, is_multi_idx, is_multi_col):
    """Compare ``unstack`` results of the two frames for flat/multi labels.

    The same index and columns are installed on both frames, then
    ``unstack`` is compared for the default level and, when the index is a
    MultiIndex, for several explicit level selections.
    """
    pandas_df = pandas.DataFrame(data)
    modin_df = pd.DataFrame(data)

    # Start from the existing labels so that a data shape without a
    # predefined MultiIndex layout cannot leave `index`/`columns` unbound.
    index = pandas_df.index
    if is_multi_idx:
        if len(index) == 256:
            index = pd.MultiIndex.from_product(
                [
                    ["a", "b", "c", "d"],
                    ["x", "y", "z", "last"],
                    ["i", "j", "k", "index"],
                    [1, 2, 3, 4],
                ]
            )
        elif len(index) == 100:
            index = pd.MultiIndex.from_product(
                [
                    ["x", "y", "z", "last"],
                    ["a", "b", "c", "d", "f"],
                    ["i", "j", "k", "l", "index"],
                ]
            )

    columns = pandas_df.columns
    if is_multi_col:
        if len(columns) == 64:
            columns = pd.MultiIndex.from_product(
                [
                    ["A", "B", "C", "D"],
                    ["xx", "yy", "zz", "LAST"],
                    [10, 20, 30, 40],
                ]
            )
        elif len(columns) == 100:
            columns = pd.MultiIndex.from_product(
                [
                    ["xx", "yy", "zz", "LAST"],
                    ["A", "B", "C", "D", "F"],
                    ["I", "J", "K", "L", "INDEX"],
                ]
            )

    pandas_df.columns = columns
    pandas_df.index = index

    modin_df.columns = columns
    modin_df.index = index

    df_equals(modin_df.unstack(), pandas_df.unstack())

    # Explicit level selections only make sense on a real MultiIndex.
    if isinstance(index, pandas.MultiIndex):
        df_equals(modin_df.unstack(level=1), pandas_df.unstack(level=1))
        df_equals(modin_df.unstack(level=[0, 1]), pandas_df.unstack(level=[0, 1]))
        df_equals(
            modin_df.unstack(level=[0, 1, 2]), pandas_df.unstack(level=[0, 1, 2])
        )

        # Only the four-level layout has a level 3 to unstack.
        if index.nlevels == 4:
            df_equals(
                modin_df.unstack(level=[0, 1, 2, 3]),
                pandas_df.unstack(level=[0, 1, 2, 3]),
            )

@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test___array__(self, data):
Expand Down
46 changes: 32 additions & 14 deletions modin/pandas/test/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3073,21 +3073,39 @@ def test_unique(data):
assert_array_equal(modin_result, pandas_result)


@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_unstack(data):
    """Compare ``Series.unstack`` results for several level selections.

    Both series are built from the first column of ``data`` with the same
    MultiIndex, chosen by the number of values in that column.
    """
    values = data[next(iter(data.keys()))]
    # The index must have exactly as many entries as there are values.
    if len(values) == 256:
        index = pd.MultiIndex.from_product(
            [
                ["a", "b", "c", "d"],
                ["x", "y", "z", "last"],
                ["i", "j", "k", "index"],
                [1, 2, 3, 4],
            ]
        )
    elif len(values) == 100:
        index = pd.MultiIndex.from_product(
            [
                ["x", "y", "z", "last"],
                ["a", "b", "c", "d", "f"],
                ["i", "j", "k", "l", "index"],
            ]
        )
    else:
        # Fail with a clear message instead of an unbound `index` further down.
        pytest.fail(
            "test_unstack has no MultiIndex layout for a series of length "
            f"{len(values)}"
        )

    modin_series = pd.Series(values, index=index)
    pandas_series = pandas.Series(values, index=index)

    df_equals(modin_series.unstack(), pandas_series.unstack())
    df_equals(modin_series.unstack(level=0), pandas_series.unstack(level=0))
    df_equals(modin_series.unstack(level=[0, 1]), pandas_series.unstack(level=[0, 1]))

    # Only the four-level layout can unstack three levels and keep one.
    if index.nlevels == 4:
        df_equals(
            modin_series.unstack(level=[0, 1, 2]),
            pandas_series.unstack(level=[0, 1, 2]),
        )


@pytest.mark.parametrize(
Expand Down

0 comments on commit 605deb4

Please sign in to comment.