FEAT-modin-project#1291 modin-project#1187: Add `DataFrame.unstack`, `Series.unstack` (modin-project#1649)

Signed-off-by: Alexey Prutskov <alexey.prutskov@intel.com>
aregm · Sep 16, 2020 · 605deb4 · 605deb4
1 parent 70c1278
commit 605deb4
Show file tree

Hide file tree

Showing 9 changed files with 244 additions and 30 deletions.
diff --git a/docs/supported_apis/dataframe_supported.rst b/docs/supported_apis/dataframe_supported.rst
@@ -430,7 +430,7 @@ default to pandas.
 +----------------------------+---------------------------+------------------------+----------------------------------------------------+
 | ``tz_localize``            | `tz_localize`_            | Y                      |                                                    |
 +----------------------------+---------------------------+------------------------+----------------------------------------------------+
-| ``unstack``                | `unstack`_                | D                      |                                                    |
+| ``unstack``                | `unstack`_                | Y                      |                                                    |
 +----------------------------+---------------------------+------------------------+----------------------------------------------------+
 | ``update``                 | `update`_                 | Y                      |                                                    |
 +----------------------------+---------------------------+------------------------+----------------------------------------------------+

diff --git a/docs/supported_apis/series_supported.rst b/docs/supported_apis/series_supported.rst
@@ -462,7 +462,7 @@ the related section on `Defaulting to pandas`_.
 +-----------------------------+---------------------------------+
 | ``unique``                  | Y                               |
 +-----------------------------+---------------------------------+
-| ``unstack``                 | D                               |
+| ``unstack``                 | Y                               |
 +-----------------------------+---------------------------------+
 | ``update``                  | Y                               |
 +-----------------------------+---------------------------------+

diff --git a/modin/backends/base/query_compiler.py b/modin/backends/base/query_compiler.py
@@ -1194,6 +1194,10 @@ def groupby_agg(self, by, axis, agg_func, groupby_args, agg_args):
 
     # END Manual Partitioning methods
 
+    @abc.abstractmethod
+    def unstack(self, level, fill_value):
+        """Unstack index level(s) of the underlying data.
+
+        Abstract: the body is empty, so each concrete query compiler has to
+        supply its own implementation.
+
+        Parameters
+        ----------
+        level : int, str, or list of these
+            Level(s) of the index to unstack.
+        fill_value : int, str or dict
+            Value that replaces NaN when the unstack produces missing values.
+
+        Returns
+        -------
+        Presumably a new query compiler holding the unstacked data -- confirm
+        against the concrete implementations.
+        """
+        pass
+
     @abc.abstractmethod
     def get_dummies(self, columns, **kwargs):
         """Convert categorical variables to dummy variables for certain columns.

diff --git a/modin/backends/pandas/query_compiler.py b/modin/backends/pandas/query_compiler.py
@@ -793,7 +793,7 @@ def resample_app_ser(self, resample_args, func, *args, **kwargs):
             df_op=lambda df: df.squeeze(axis=1),
             func=func,
             *args,
-            **kwargs
+            **kwargs,
         )
 
     def resample_app_df(self, resample_args, func, *args, **kwargs):
@@ -806,7 +806,7 @@ def resample_agg_ser(self, resample_args, func, *args, **kwargs):
             df_op=lambda df: df.squeeze(axis=1),
             func=func,
             *args,
-            **kwargs
+            **kwargs,
         )
 
     def resample_agg_df(self, resample_args, func, *args, **kwargs):
@@ -851,7 +851,7 @@ def resample_interpolate(
         limit_direction,
         limit_area,
         downcast,
-        **kwargs
+        **kwargs,
     ):
         return self._resample_func(
             resample_args,
@@ -862,7 +862,7 @@ def resample_interpolate(
             limit_direction=limit_direction,
             limit_area=limit_area,
             downcast=downcast,
-            **kwargs
+            **kwargs,
         )
 
     def resample_count(self, resample_args):
@@ -910,7 +910,7 @@ def resample_ohlc_ser(self, resample_args, _method, *args, **kwargs):
             df_op=lambda df: df.squeeze(axis=1),
             _method=_method,
             *args,
-            **kwargs
+            **kwargs,
         )
 
     def resample_ohlc_df(self, resample_args, _method, *args, **kwargs):
@@ -1075,6 +1075,79 @@ def rolling_aggregate(self, rolling_args, func, *args, **kwargs):
         )
         return self.__constructor__(new_modin_frame)
 
+    def unstack(self, level, fill_value):
+        """Unstack index level(s) by applying ``df.unstack`` along a full axis.
+
+        Parameters
+        ----------
+        level : int, str, or list of these
+            Index level(s) to unstack; forwarded unchanged to ``df.unstack``.
+        fill_value : int, str or dict
+            Forwarded unchanged to ``df.unstack`` as its ``fill_value``.
+
+        Returns
+        -------
+        A query compiler built with ``self.__constructor__`` from the
+        transformed frame, re-labelled along the index where needed.
+        """
+        # When the index is not a MultiIndex, or when ``level`` lists as many
+        # levels as the MultiIndex has, the result is labelled with the single
+        # column "__reduced__" and reindexed further down.
+        # NOTE(review): the inner ``isinstance`` check is redundant -- the right
+        # operand of ``or`` is only evaluated when the index is a MultiIndex.
+        if not isinstance(self.index, pandas.MultiIndex) or (
+            isinstance(self.index, pandas.MultiIndex)
+            and is_list_like(level)
+            and len(level) == self.index.nlevels
+        ):
+            axis = 1
+            new_columns = ["__reduced__"]
+            need_reindex = True
+        else:
+            axis = 0
+            new_columns = None
+            need_reindex = False
+
+        # Function handed to ``_apply_full_axis``. The ``pandas.DataFrame(...)``
+        # wrapper turns a Series returned by ``unstack`` into a DataFrame.
+        def map_func(df):
+            return pandas.DataFrame(df.unstack(level=level, fill_value=fill_value))
+
+        # Special case: both axes are MultiIndexes and ``level`` lists as many
+        # levels as the index has. The operation then runs on a copy whose
+        # columns are replaced by positional integers; the real columns are
+        # kept aside so the final index can be rebuilt from them.
+        is_all_multi_list = False
+        if (
+            isinstance(self.index, pandas.MultiIndex)
+            and isinstance(self.columns, pandas.MultiIndex)
+            and is_list_like(level)
+            and len(level) == self.index.nlevels
+        ):
+            is_all_multi_list = True
+            real_cols_bkp = self.columns
+            obj = self.copy()
+            obj.columns = np.arange(len(obj.columns))
+        else:
+            obj = self
+
+        new_modin_frame = obj._modin_frame._apply_full_axis(
+            axis, map_func, new_columns=new_columns
+        )
+        result = self.__constructor__(new_modin_frame)
+
+        if is_all_multi_list:
+            # Sort by index, then relabel the index with the product of the
+            # unique values of each original column level and the index levels.
+            # assumes the sorted rows line up with that product -- TODO confirm
+            result = result.sort_index()
+            index_level_values = [lvl for lvl in obj.index.levels]
+            columns_level_values = [
+                real_cols_bkp.get_level_values(lvl).unique()
+                for lvl in np.arange(real_cols_bkp.nlevels)
+            ]
+            result.index = pandas.MultiIndex.from_product(
+                [*columns_level_values, *index_level_values]
+            )
+            # Return here: ``need_reindex`` is also True in this case, but the
+            # index has already been rebuilt above.
+            return result
+
+        if need_reindex:
+            # Build the product of the column labels and the index labels
+            # (unique values per level when an axis is a MultiIndex) and
+            # reindex the result to it.
+            if isinstance(self.index, pandas.MultiIndex):
+                index_level_values = [
+                    self.index.get_level_values(lvl).unique()
+                    for lvl in np.arange(self.index.nlevels)
+                ]
+                new_index = pandas.MultiIndex.from_product(
+                    [self.columns, *index_level_values]
+                )
+            else:
+                if isinstance(self.columns, pandas.MultiIndex):
+                    columns_level_values = [
+                        self.columns.get_level_values(lvl).unique()
+                        for lvl in np.arange(self.columns.nlevels)
+                    ]
+                    new_index = pandas.MultiIndex.from_product(
+                        [*columns_level_values, self.index]
+                    )
+                else:
+                    new_index = pandas.MultiIndex.from_product(
+                        [self.columns, self.index]
+                    )
+            # The first argument is presumably the axis (0 = index) -- verify
+            # against this class's ``reindex`` signature.
+            result = result.reindex(0, new_index)
+        return result
+
     # Map partitions operations
     # These operations are operations that apply a function to every partition.
     abs = MapFunction.register(pandas.DataFrame.abs, dtypes="copy")
@@ -1632,7 +1705,7 @@ def sort_index(self, **kwargs):
                 axis=axis,
                 level=level,
                 sort_remaining=sort_remaining,
-                **kwargs
+                **kwargs,
             )
 
         # sort_index can have ascending be None and behaves as if it is False.
@@ -2147,7 +2220,7 @@ def compute_groupby(df):
             try:
                 agg_func(
                     pandas.DataFrame(index=[1], columns=[1]).groupby(level=0),
-                    **agg_args
+                    **agg_args,
                 )
             except Exception as e:
                 raise type(e)("No numeric types to aggregate.")

diff --git a/modin/pandas/base.py b/modin/pandas/base.py
@@ -3333,9 +3333,6 @@ def tz_localize(
         )
         return self.set_axis(labels=new_labels, axis=axis, inplace=not copy)
 
-    def unstack(self, level=-1, fill_value=None):
-        return self._default_to_pandas("unstack", level=level, fill_value=fill_value)
-
     def var(
         self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
     ):

diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
@@ -1680,6 +1680,37 @@ def slice_shift(self, periods=1, axis=0):
                 new_df.columns = new_columns
                 return new_df
 
+    def unstack(self, level=-1, fill_value=None):
+        """
+        Pivot a level of the (necessarily hierarchical) index labels.
+
+        Returns a DataFrame having a new level of column labels whose inner-most
+        level consists of the pivoted index labels.
+
+        If the index is not a MultiIndex, the output will be a Series
+        (the analogue of stack when the columns are not a MultiIndex).
+
+        The level involved will automatically get sorted.
+
+        Parameters
+        ----------
+        level : int, str, or list of these, default -1 (last level)
+            Level(s) of index to unstack, can pass level name.
+        fill_value : int, str or dict
+            Replace NaN with this value if the unstack produces missing values.
+
+        Returns
+        -------
+        Series or DataFrame
+        """
+        # A plain (non-Multi) index, or a list-like ``level`` with as many
+        # entries as the index has levels, goes through ``_reduce_dimension``
+        # (presumably yielding a Series -- confirm against that helper); any
+        # other input is wrapped in a DataFrame.
+        if not isinstance(self.index, pandas.MultiIndex) or (
+            isinstance(self.index, pandas.MultiIndex)
+            and is_list_like(level)
+            and len(level) == self.index.nlevels
+        ):
+            return self._reduce_dimension(
+                query_compiler=self._query_compiler.unstack(level, fill_value)
+            )
+        else:
+            return DataFrame(
+                query_compiler=self._query_compiler.unstack(level, fill_value)
+            )
+
     def pivot(self, index=None, columns=None, values=None):
         return self._default_to_pandas(
             pandas.DataFrame.pivot, index=index, columns=columns, values=values

diff --git a/modin/pandas/series.py b/modin/pandas/series.py
@@ -1079,6 +1079,29 @@ def slice_shift(self, periods=1, axis=0):
                 )
             )
 
+    def unstack(self, level=-1, fill_value=None):
+        """
+        Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame.
+
+        The level involved will automatically get sorted.
+
+        Parameters
+        ----------
+        level : int, str, or list of these, default last level
+            Level(s) to unstack, can pass level name.
+        fill_value : scalar value, default None
+            Value to use when replacing NaN values.
+
+        Returns
+        -------
+        DataFrame
+            Unstacked Series.
+        """
+        # Imported inside the method rather than at module level; presumably
+        # this avoids a circular import with ``.dataframe`` -- TODO confirm.
+        from .dataframe import DataFrame
+
+        result = DataFrame(
+            query_compiler=self._query_compiler.unstack(level, fill_value)
+        )
+
+        # When the resulting columns have more than one level, the outermost
+        # level is dropped; otherwise the result is returned unchanged.
+        return result.droplevel(0, axis=1) if result.columns.nlevels > 1 else result
+
     @property
     def plot(
         self,

diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py
@@ -2950,10 +2950,78 @@ def test_tz_localize(self):
             pandas_df.tz_localize("America/Los_Angeles", axis=0),
         )
 
-    def test_unstack(self):
-        data = test_data_values[0]
-        with pytest.warns(UserWarning):
-            pd.DataFrame(data).unstack()
+    @pytest.mark.parametrize(
+        "is_multi_idx", [True, False], ids=["idx_multi", "idx_index"]
+    )
+    @pytest.mark.parametrize(
+        "is_multi_col", [True, False], ids=["col_multi", "col_index"]
+    )
+    @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
+    def test_unstack(self, data, is_multi_idx, is_multi_col):
+        """Compare ``unstack`` of a Modin DataFrame against pandas.
+
+        Runs for every combination of flat/MultiIndex index and columns, with
+        the default level and, for a MultiIndex index, explicit levels.
+        """
+        pandas_df = pandas.DataFrame(data)
+        modin_df = pd.DataFrame(data)
+
+        # The MultiIndex is chosen by row count: 4*4*4*4 = 256 or 4*5*5 = 100.
+        # NOTE(review): for any other row count ``index`` stays unbound and the
+        # assignment below raises NameError.
+        if is_multi_idx:
+            if len(pandas_df.index) == 256:
+                index = pd.MultiIndex.from_product(
+                    [
+                        ["a", "b", "c", "d"],
+                        ["x", "y", "z", "last"],
+                        ["i", "j", "k", "index"],
+                        [1, 2, 3, 4],
+                    ]
+                )
+            elif len(pandas_df.index) == 100:
+                index = pd.MultiIndex.from_product(
+                    [
+                        ["x", "y", "z", "last"],
+                        ["a", "b", "c", "d", "f"],
+                        ["i", "j", "k", "l", "index"],
+                    ]
+                )
+        else:
+            index = pandas_df.index
+
+        # Same scheme for the columns: 4*4*4 = 64 or 4*5*5 = 100 labels.
+        # NOTE(review): ``columns`` is likewise unbound for other column counts.
+        if is_multi_col:
+            if len(pandas_df.columns) == 64:
+                columns = pd.MultiIndex.from_product(
+                    [
+                        ["A", "B", "C", "D"],
+                        ["xx", "yy", "zz", "LAST"],
+                        [10, 20, 30, 40],
+                    ]
+                )
+            elif len(pandas_df.columns) == 100:
+                columns = pd.MultiIndex.from_product(
+                    [
+                        ["xx", "yy", "zz", "LAST"],
+                        ["A", "B", "C", "D", "F"],
+                        ["I", "J", "K", "L", "INDEX"],
+                    ]
+                )
+        else:
+            columns = pandas_df.columns
+
+        # Give both frames identical labels so their results are comparable.
+        pandas_df.columns = columns
+        pandas_df.index = index
+
+        modin_df.columns = columns
+        modin_df.index = index
+
+        df_equals(modin_df.unstack(), pandas_df.unstack())
+
+        # Explicit levels are only exercised with a MultiIndex index.
+        if is_multi_idx:
+            df_equals(modin_df.unstack(level=1), pandas_df.unstack(level=1))
+            df_equals(modin_df.unstack(level=[0, 1]), pandas_df.unstack(level=[0, 1]))
+            df_equals(
+                modin_df.unstack(level=[0, 1, 2]), pandas_df.unstack(level=[0, 1, 2])
+            )
+
+            # Only the 256-row index has a fourth level to unstack.
+            if len(pandas_df.index) == 256:
+                df_equals(
+                    modin_df.unstack(level=[0, 1, 2, 3]),
+                    pandas_df.unstack(level=[0, 1, 2, 3]),
+                )
 
     @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
     def test___array__(self, data):

diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py
@@ -3073,21 +3073,39 @@ def test_unique(data):
     assert_array_equal(modin_result, pandas_result)
 
 
-def test_unstack():
-    s = pd.Series(
-        np.random.randint(1, 100, 12),
-        index=pd.MultiIndex.from_tuples(
+@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
+def test_unstack(data):
+    """Compare ``Series.unstack`` on a MultiIndex between Modin and pandas."""
+    # NOTE(review): both series created here are overwritten below; only the
+    # index length of ``pandas_series`` is used.
+    modin_series, pandas_series = create_test_series(data)
+    # The MultiIndex is chosen by length: 4*4*4*4 = 256 or 4*5*5 = 100.
+    # NOTE(review): for any other length ``index`` stays unbound.
+    if len(pandas_series.index) == 256:
+        index = pd.MultiIndex.from_product(
             [
-                (num, letter, color)
-                for num in range(1, 3)
-                for letter in ["a", "b", "c"]
-                for color in ["Red", "Green"]
-            ],
-            names=["Number", "Letter", "Color"],
-        ),
-    )
-    with pytest.warns(UserWarning):
-        s.unstack()
+                ["a", "b", "c", "d"],
+                ["x", "y", "z", "last"],
+                ["i", "j", "k", "index"],
+                [1, 2, 3, 4],
+            ]
+        )
+    elif len(pandas_series.index) == 100:
+        index = pd.MultiIndex.from_product(
+            [
+                ["x", "y", "z", "last"],
+                ["a", "b", "c", "d", "f"],
+                ["i", "j", "k", "l", "index"],
+            ]
+        )
+
+    # Rebuild both series from the first key of ``data`` with the MultiIndex.
+    modin_series = pd.Series(data[next(iter(data.keys()))], index=index)
+    pandas_series = pandas.Series(data[next(iter(data.keys()))], index=index)
+
+    df_equals(modin_series.unstack(), pandas_series.unstack())
+    df_equals(modin_series.unstack(level=0), pandas_series.unstack(level=0))
+    df_equals(modin_series.unstack(level=[0, 1]), pandas_series.unstack(level=[0, 1]))
+
+    # Three levels are unstacked only for the four-level (256-row) index.
+    if len(pandas_series.index) == 256:
+        df_equals(
+            modin_series.unstack(level=[0, 1, 2]),
+            pandas_series.unstack(level=[0, 1, 2]),
+        )
 
 
 @pytest.mark.parametrize(