From 605deb43e543feb6dc9446ed831e8e15dd3a9975 Mon Sep 17 00:00:00 2001 From: Alexey Prutskov Date: Fri, 28 Aug 2020 14:14:25 +0300 Subject: [PATCH] FEAT-#1291 #1187: Add `DataFrame.unstack`, `Series.unstack` (#1649) Signed-off-by: Alexey Prutskov --- docs/supported_apis/dataframe_supported.rst | 2 +- docs/supported_apis/series_supported.rst | 2 +- modin/backends/base/query_compiler.py | 4 + modin/backends/pandas/query_compiler.py | 87 +++++++++++++++++++-- modin/pandas/base.py | 3 - modin/pandas/dataframe.py | 31 ++++++++ modin/pandas/series.py | 23 ++++++ modin/pandas/test/test_dataframe.py | 76 +++++++++++++++++- modin/pandas/test/test_series.py | 46 +++++++---- 9 files changed, 244 insertions(+), 30 deletions(-) diff --git a/docs/supported_apis/dataframe_supported.rst b/docs/supported_apis/dataframe_supported.rst index 1ad585ce228..5725788b2fb 100644 --- a/docs/supported_apis/dataframe_supported.rst +++ b/docs/supported_apis/dataframe_supported.rst @@ -430,7 +430,7 @@ default to pandas. +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``tz_localize`` | `tz_localize`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ -| ``unstack`` | `unstack`_ | D | | +| ``unstack`` | `unstack`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ | ``update`` | `update`_ | Y | | +----------------------------+---------------------------+------------------------+----------------------------------------------------+ diff --git a/docs/supported_apis/series_supported.rst b/docs/supported_apis/series_supported.rst index 962aef16f87..3c89d23ade6 100644 --- a/docs/supported_apis/series_supported.rst +++ b/docs/supported_apis/series_supported.rst @@ -462,7 +462,7 @@ the related section on `Defaulting to pandas`_. +-----------------------------+---------------------------------+ | ``unique`` | Y | +-----------------------------+---------------------------------+ -| ``unstack`` | D | +| ``unstack`` | Y | +-----------------------------+---------------------------------+ | ``update`` | Y | +-----------------------------+---------------------------------+ diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py index 1f4d6d596f0..c13be057439 100644 --- a/modin/backends/base/query_compiler.py +++ b/modin/backends/base/query_compiler.py @@ -1194,6 +1194,10 @@ def groupby_agg(self, by, axis, agg_func, groupby_args, agg_args): # END Manual Partitioning methods + @abc.abstractmethod + def unstack(self, level, fill_value): + pass + @abc.abstractmethod def get_dummies(self, columns, **kwargs): """Convert categorical variables to dummy variables for certain columns. diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py index 490cdf8db3f..a5493311ffa 100644 --- a/modin/backends/pandas/query_compiler.py +++ b/modin/backends/pandas/query_compiler.py @@ -793,7 +793,7 @@ def resample_app_ser(self, resample_args, func, *args, **kwargs): df_op=lambda df: df.squeeze(axis=1), func=func, *args, - **kwargs + **kwargs, ) def resample_app_df(self, resample_args, func, *args, **kwargs): @@ -806,7 +806,7 @@ def resample_agg_ser(self, resample_args, func, *args, **kwargs): df_op=lambda df: df.squeeze(axis=1), func=func, *args, - **kwargs + **kwargs, ) def resample_agg_df(self, resample_args, func, *args, **kwargs): @@ -851,7 +851,7 @@ def resample_interpolate( limit_direction, limit_area, downcast, - **kwargs + **kwargs, ): return self._resample_func( resample_args, @@ -862,7 +862,7 @@ def resample_interpolate( limit_direction=limit_direction, limit_area=limit_area, downcast=downcast, - **kwargs + **kwargs, ) def resample_count(self, resample_args): @@ -910,7 +910,7 @@ def resample_ohlc_ser(self, resample_args, _method, *args, **kwargs): df_op=lambda df: df.squeeze(axis=1), _method=_method, *args, - **kwargs + **kwargs, ) def resample_ohlc_df(self, resample_args, _method, *args, **kwargs): @@ -1075,6 +1075,79 @@ def rolling_aggregate(self, rolling_args, func, *args, **kwargs): ) return self.__constructor__(new_modin_frame) + def unstack(self, level, fill_value): + if not isinstance(self.index, pandas.MultiIndex) or ( + isinstance(self.index, pandas.MultiIndex) + and is_list_like(level) + and len(level) == self.index.nlevels + ): + axis = 1 + new_columns = ["__reduced__"] + need_reindex = True + else: + axis = 0 + new_columns = None + need_reindex = False + + def map_func(df): + return pandas.DataFrame(df.unstack(level=level, fill_value=fill_value)) + + is_all_multi_list = False + if ( + isinstance(self.index, pandas.MultiIndex) + and isinstance(self.columns, pandas.MultiIndex) + and is_list_like(level) + and len(level) == self.index.nlevels + ): + is_all_multi_list = True + real_cols_bkp = self.columns + obj = self.copy() + obj.columns = np.arange(len(obj.columns)) + else: + obj = self + + new_modin_frame = obj._modin_frame._apply_full_axis( + axis, map_func, new_columns=new_columns + ) + result = self.__constructor__(new_modin_frame) + + if is_all_multi_list: + result = result.sort_index() + index_level_values = [lvl for lvl in obj.index.levels] + columns_level_values = [ + real_cols_bkp.get_level_values(lvl).unique() + for lvl in np.arange(real_cols_bkp.nlevels) + ] + result.index = pandas.MultiIndex.from_product( + [*columns_level_values, *index_level_values] + ) + return result + + if need_reindex: + if isinstance(self.index, pandas.MultiIndex): + index_level_values = [ + self.index.get_level_values(lvl).unique() + for lvl in np.arange(self.index.nlevels) + ] + new_index = pandas.MultiIndex.from_product( + [self.columns, *index_level_values] + ) + else: + if isinstance(self.columns, pandas.MultiIndex): + columns_level_values = [ + self.columns.get_level_values(lvl).unique() + for lvl in np.arange(self.columns.nlevels) + ] + new_index = pandas.MultiIndex.from_product( + [*columns_level_values, self.index] + ) + else: + new_index = pandas.MultiIndex.from_product( + [self.columns, self.index] + ) + result = result.reindex(0, new_index) + return result + # Map partitions operations # These operations are operations that apply a function to every partition. abs = MapFunction.register(pandas.DataFrame.abs, dtypes="copy") @@ -1632,7 +1705,7 @@ def sort_index(self, **kwargs): axis=axis, level=level, sort_remaining=sort_remaining, - **kwargs + **kwargs, ) # sort_index can have ascending be None and behaves as if it is False. @@ -2147,7 +2220,7 @@ def compute_groupby(df): try: agg_func( pandas.DataFrame(index=[1], columns=[1]).groupby(level=0), - **agg_args + **agg_args, ) except Exception as e: raise type(e)("No numeric types to aggregate.") diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 00fb5123648..71450b75182 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -3333,9 +3333,6 @@ def tz_localize( ) return self.set_axis(labels=new_labels, axis=axis, inplace=not copy) - def unstack(self, level=-1, fill_value=None): - return self._default_to_pandas("unstack", level=level, fill_value=fill_value) - def var( self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs ): diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index a03481f3ac7..cf1700c01c5 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -1680,6 +1680,37 @@ def slice_shift(self, periods=1, axis=0): new_df.columns = new_columns return new_df + def unstack(self, level=-1, fill_value=None): + """ + Pivot a level of the (necessarily hierarchical) index labels. + Returns a DataFrame having a new level of column labels whose inner-most level + consists of the pivoted index labels. + If the index is not a MultiIndex, the output will be a Series + (the analogue of stack when the columns are not a MultiIndex). + The level involved will automatically get sorted. + Parameters + ---------- + level : int, str, or list of these, default -1 (last level) + Level(s) of index to unstack, can pass level name. + fill_value : int, str or dict + Replace NaN with this value if the unstack produces missing values. + Returns + ------- + Series or DataFrame + """ + if not isinstance(self.index, pandas.MultiIndex) or ( + isinstance(self.index, pandas.MultiIndex) + and is_list_like(level) + and len(level) == self.index.nlevels + ): + return self._reduce_dimension( + query_compiler=self._query_compiler.unstack(level, fill_value) + ) + else: + return DataFrame( + query_compiler=self._query_compiler.unstack(level, fill_value) + ) + def pivot(self, index=None, columns=None, values=None): return self._default_to_pandas( pandas.DataFrame.pivot, index=index, columns=columns, values=values diff --git a/modin/pandas/series.py b/modin/pandas/series.py index c2c308be80d..939806c3bda 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -1079,6 +1079,29 @@ def slice_shift(self, periods=1, axis=0): ) ) + def unstack(self, level=-1, fill_value=None): + """ + Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame. + The level involved will automatically get sorted. + Parameters + ---------- + level : int, str, or list of these, default last level + Level(s) to unstack, can pass level name. + fill_value : scalar value, default None + Value to use when replacing NaN values. + Returns + ------- + DataFrame + Unstacked Series. + """ + from .dataframe import DataFrame + + result = DataFrame( + query_compiler=self._query_compiler.unstack(level, fill_value) + ) + + return result.droplevel(0, axis=1) if result.columns.nlevels > 1 else result + @property def plot( self, diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 7362b137faf..b96a1eef2b6 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -2950,10 +2950,78 @@ def test_tz_localize(self): pandas_df.tz_localize("America/Los_Angeles", axis=0), ) - def test_unstack(self): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).unstack() + @pytest.mark.parametrize( + "is_multi_idx", [True, False], ids=["idx_multi", "idx_index"] + ) + @pytest.mark.parametrize( + "is_multi_col", [True, False], ids=["col_multi", "col_index"] + ) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) + def test_unstack(self, data, is_multi_idx, is_multi_col): + pandas_df = pandas.DataFrame(data) + modin_df = pd.DataFrame(data) + + if is_multi_idx: + if len(pandas_df.index) == 256: + index = pd.MultiIndex.from_product( + [ + ["a", "b", "c", "d"], + ["x", "y", "z", "last"], + ["i", "j", "k", "index"], + [1, 2, 3, 4], + ] + ) + elif len(pandas_df.index) == 100: + index = pd.MultiIndex.from_product( + [ + ["x", "y", "z", "last"], + ["a", "b", "c", "d", "f"], + ["i", "j", "k", "l", "index"], + ] + ) + else: + index = pandas_df.index + + if is_multi_col: + if len(pandas_df.columns) == 64: + columns = pd.MultiIndex.from_product( + [ + ["A", "B", "C", "D"], + ["xx", "yy", "zz", "LAST"], + [10, 20, 30, 40], + ] + ) + elif len(pandas_df.columns) == 100: + columns = pd.MultiIndex.from_product( + [ + ["xx", "yy", "zz", "LAST"], + ["A", "B", "C", "D", "F"], + ["I", "J", "K", "L", "INDEX"], + ] + ) + else: + columns = pandas_df.columns + + pandas_df.columns = columns + pandas_df.index = index + + modin_df.columns = columns + modin_df.index = index + + df_equals(modin_df.unstack(), pandas_df.unstack()) + + if is_multi_idx: + df_equals(modin_df.unstack(level=1), pandas_df.unstack(level=1)) + df_equals(modin_df.unstack(level=[0, 1]), pandas_df.unstack(level=[0, 1])) + df_equals( + modin_df.unstack(level=[0, 1, 2]), pandas_df.unstack(level=[0, 1, 2]) + ) + + if len(pandas_df.index) == 256: + df_equals( + modin_df.unstack(level=[0, 1, 2, 3]), + pandas_df.unstack(level=[0, 1, 2, 3]), + ) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test___array__(self, data): diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 1a3461fd4b9..b866e585ef3 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -3073,21 +3073,39 @@ def test_unique(data): assert_array_equal(modin_result, pandas_result) -def test_unstack(): - s = pd.Series( - np.random.randint(1, 100, 12), - index=pd.MultiIndex.from_tuples( +@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +def test_unstack(data): + modin_series, pandas_series = create_test_series(data) + if len(pandas_series.index) == 256: + index = pd.MultiIndex.from_product( [ - (num, letter, color) - for num in range(1, 3) - for letter in ["a", "b", "c"] - for color in ["Red", "Green"] - ], - names=["Number", "Letter", "Color"], - ), - ) - with pytest.warns(UserWarning): - s.unstack() + ["a", "b", "c", "d"], + ["x", "y", "z", "last"], + ["i", "j", "k", "index"], + [1, 2, 3, 4], + ] + ) + elif len(pandas_series.index) == 100: + index = pd.MultiIndex.from_product( + [ + ["x", "y", "z", "last"], + ["a", "b", "c", "d", "f"], + ["i", "j", "k", "l", "index"], + ] + ) + + modin_series = pd.Series(data[next(iter(data.keys()))], index=index) + pandas_series = pandas.Series(data[next(iter(data.keys()))], index=index) + + df_equals(modin_series.unstack(), pandas_series.unstack()) + df_equals(modin_series.unstack(level=0), pandas_series.unstack(level=0)) + df_equals(modin_series.unstack(level=[0, 1]), pandas_series.unstack(level=[0, 1])) + + if len(pandas_series.index) == 256: + df_equals( + modin_series.unstack(level=[0, 1, 2]), + pandas_series.unstack(level=[0, 1, 2]), + ) @pytest.mark.parametrize(