FEAT-#4419: Extend virtual partitioning API to pandas on Dask #4420

Merged

Changes from 10 commits (30 commits total):
28cdc19
FEAT-#4419: Extend virtual partitioning API to pandas on Dask
RehanSD Apr 25, 2022
15b0072
Format code
RehanSD Apr 25, 2022
ae4eb6c
Fix list_of_blocks add list_of_ips
RehanSD Apr 26, 2022
026bd14
Fix documentation
RehanSD Apr 26, 2022
fa377a8
Fix docs
RehanSD Apr 26, 2022
b9d68f7
Update modin/core/execution/dask/implementations/pandas_on_dask/parti…
RehanSD May 2, 2022
17a9c65
Merge remote-tracking branch 'upstream/master' into rehan/virtual_par…
RehanSD Jun 13, 2022
348272d
Merge remote-tracking branch 'upstream/master' into rehan/virtual_par…
RehanSD Jun 15, 2022
6784b4b
Add comments to virtual partition, and update drain call queue to acc…
RehanSD Jun 15, 2022
4efd83b
lint
RehanSD Jun 15, 2022
af4edbc
copy rebalance_partitions implementation to dask virtual partitions
RehanSD Jun 15, 2022
0ac6de3
Fix rebalance_partitions, update groupby test, and add test for virtu…
RehanSD Jun 16, 2022
8c44eb1
Add name to release notes
RehanSD Jun 16, 2022
a5af1ff
lint
RehanSD Jun 16, 2022
971e00a
Refactor to reduce code redundancy
RehanSD Jun 16, 2022
87cad7c
fix docs
RehanSD Jun 16, 2022
0de19cd
Apply suggestions from code review
RehanSD Jun 16, 2022
4f5dd50
update tests
RehanSD Jun 16, 2022
f5d3eb1
Merge branch 'rehan/virtual_partitioning_dask' of https://github.com/…
RehanSD Jun 16, 2022
5e748cd
Fix typo in tests
RehanSD Jun 16, 2022
0cc25c4
Add more comprehensive test
RehanSD Jun 16, 2022
43296b2
Merge remote-tracking branch 'upstream/master' into rehan/virtual_par…
RehanSD Jun 17, 2022
66c4360
Resolve col width issue
RehanSD Jun 19, 2022
9f48f0d
lint
RehanSD Jun 19, 2022
8dab0d1
flake8
RehanSD Jun 19, 2022
2d9f150
Resolve col width issue without hurting perf
RehanSD Jun 20, 2022
9a945e8
Move concat into testr
RehanSD Jun 27, 2022
5b322ee
Address review comments
RehanSD Jul 12, 2022
a761182
Merge remote-tracking branch 'upstream/master' into rehan/virtual_par…
RehanSD Jul 12, 2022
6a9855c
fix typos
RehanSD Jul 12, 2022
1 change: 1 addition & 0 deletions docs/release_notes/release_notes-0.16.0.rst
@@ -39,4 +39,5 @@ Contributors
 @mvashishtha
 @NickCrews
 @prutskov
 @vnlitvinov
+@RehanSD
modin/core/execution/dask/implementations/pandas_on_dask/dataframe/dataframe.py
@@ -15,6 +15,7 @@

 from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe
 from ..partitioning.partition_manager import PandasOnDaskDataframePartitionManager
+from modin.core.execution.dask.common.engine_wrapper import DaskWrapper


class PandasOnDaskDataframe(PandasDataframe):
@@ -41,22 +42,63 @@ class PandasOnDaskDataframe(PandasDataframe):

    _partition_mgr_cls = PandasOnDaskDataframePartitionManager

    def _get_partition_size_along_axis(self, partition, axis=0):
Collaborator:
Per #4494, we are currently calculating partition shapes serially in places other than the one you changed here, e.g. here for Ray, in the Ray virtual partitioning length, and in the same place (length) for the new Dask virtual partition class. The Dask blocking means that this function actually blocks on inner partitions if the Modin frame consists of virtual partitions that are themselves made of virtual partitions.

In my opinion, we should parallelize getting all the partition shapes correctly in a separate fix for #4494.

Collaborator:
Ah, I'm wrong about this function blocking on inner partitions, because it doesn't call length or width. So I think it's correct. Still, this part seems outside the scope of this PR. I'd rather worry about this fix in a separate PR for #4494.

Collaborator (Author):
I think it makes sense to include it here: the original code to get length and width in parallel breaks when applied to axis partitions, so I'll need to fix that code anyway, and if I'm doing that, I may as well fix it to do everything in parallel.

Collaborator:
If the original code was broken, I think it's okay to include the partial fix for #4494 here.

"""
Compute the length along the specified axis of the specified partition.

Parameters
----------
partition : ``PandasOnDaskDataframeVirtualPartition`` or ``PandasOnDaskDataframePartition``
The partition whose size to compute.
axis : int, default: 0
The axis along which to compute size.

Returns
-------
list
A list of lengths along the specified axis that sum to the overall length of the partition
along the specified axis.

Notes
-----
This utility function is used to ensure that computation occurs asynchronously across all partitions
whether the partitions are virtual or physical partitions.
"""
if isinstance(partition, self._partition_mgr_cls._partition_class):
return [
partition.apply(
lambda df: len(df) if not axis else len(df.columns)
)._data
]
elif partition.axis == axis:
return [
ptn.apply(lambda df: len(df) if not axis else len(df.columns))._data
for ptn in partition.list_of_partitions_to_combine
]
return [
partition.list_of_partitions_to_combine[0]
.apply(lambda df: len(df) if not axis else (len(df.columns)))
._data
]
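To make the Notes section concrete, here is a minimal sketch (illustrative only, not part of the diff) of why the sizes are returned as unmaterialized futures; `size_futures` is a hypothetical name for a flat list of futures this method produces, and the sketch assumes `DaskWrapper.materialize` gathers a single future as well as a list, as its batched use below suggests:

    # Serial: each materialize() call blocks before the next future is
    # gathered, costing one scheduler round trip per partition.
    sizes = [DaskWrapper.materialize(future) for future in size_futures]

    # Batched: gather every future in one call so the cluster resolves them
    # concurrently; this is the pattern _row_lengths relies on below.
    sizes = DaskWrapper.materialize(size_futures)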

    @property
    def _row_lengths(self):
        """
-        Compute the row partitions lengths if they are not cached.
+        Compute ther row partitions lengths if they are not cached.
Collaborator:
Suggested change:
-        Compute ther row partitions lengths if they are not cached.
+        Compute the row partition lengths if they are not cached.

Collaborator:
not fixed

        Returns
        -------
        list
            A list of row partitions lengths.
        """
        if self._row_lengths_cache is None:
-            self._row_lengths_cache = (
-                self._partition_mgr_cls.get_objects_from_partitions(
-                    [obj.apply(lambda df: len(df)) for obj in self._partitions.T[0]]
-                )
-            )
+            row_lengths_list = DaskWrapper.materialize(
+                [
+                    self._get_partition_size_along_axis(obj, axis=0)
+                    for obj in self._partitions.T[0]
+                ]
+            )
+            self._row_lengths_cache = [sum(len_list) for len_list in row_lengths_list]
        return self._row_lengths_cache
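As a worked illustration of the summing step (hypothetical values, not part of the diff): if the first column of `_partitions` held one virtual row partition made of two inner blocks plus one plain block partition, the materialized result and the cache would look like this:

    # Hypothetical values returned by DaskWrapper.materialize above.
    row_lengths_list = [[100, 150], [250]]
    row_lengths_cache = [sum(len_list) for len_list in row_lengths_list]
    assert row_lengths_cache == [250, 250]  # one total per row partition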

    @property

@@ -70,12 +112,13 @@ def _column_widths(self):
            A list of column partitions widths.
        """
        if self._column_widths_cache is None:
-            self._column_widths_cache = (
-                self._partition_mgr_cls.get_objects_from_partitions(
-                    [
-                        obj.apply(lambda df: len(df.columns))
-                        for obj in self._partitions[0]
-                    ]
-                )
-            )
+            col_widths_list = DaskWrapper.materialize(
+                [
+                    self._get_partition_size_along_axis(obj, axis=1)
+                    for obj in self._partitions[0]
+                ]
+            )
+            self._column_widths_cache = [
+                sum(width_list) for width_list in col_widths_list
+            ]
        return self._column_widths_cache
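A short usage note (hypothetical, assuming `frame` is a `PandasOnDaskDataframe`): once both caches are populated, the frame's global shape follows by summation without touching the cluster again:

    n_rows = sum(frame._row_lengths)    # total rows across row partitions
    n_cols = sum(frame._column_widths)  # total columns across column partitions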
2 changes: 1 addition & 1 deletion modin/pandas/test/test_groupby.py
@@ -1982,7 +1982,7 @@ def test_groupby_with_virtual_partitions():

    # Check that the constructed Modin DataFrame has virtual partitions when
    # using Ray or Dask, and doesn't when using other execution engines.
-    if Engine.get() == "Ray" or Engine.get() == "Dask":
+    if Engine.get() in ["Ray", "Dask"]:
        assert issubclass(
            type(big_modin_df._query_compiler._modin_frame._partitions[0][0]),
            PandasDataframeAxisPartition,
81 changes: 61 additions & 20 deletions modin/test/storage_formats/pandas/test_internals.py
@@ -112,36 +112,77 @@ def func_to_apply(partition, row_internal_indices, col_internal_indices, item):
    df_equals(md_df, pd_df)


small_dfs = [
    pd.DataFrame(
        [[i + j for j in range(0, 1000)]],
        columns=[f"col{j}" for j in range(1, 1001)],
        index=pd.Index([i - 1]),
    )
    for i in range(1, 100001, 1000)
]
Collaborator:
Suggested change:
-small_dfs = [
-    pd.DataFrame(
-        [[i + j for j in range(0, 1000)]],
-        columns=[f"col{j}" for j in range(1, 1001)],
-        index=pd.Index([i - 1]),
-    )
-    for i in range(1, 100001, 1000)
-]
+small_dfs = [
+    pd.DataFrame([[i + j for j in range(0, 1000)]]).add_prefix('col')
+    for i in range(0, 100 * 1000, 1000)
+]

I prefer to have the values start from 0. Then we can create the dataframe like this. Let me know if you prefer the current way.

large_df = pd.DataFrame(
    [[i + j for j in range(1, 1000)] for i in range(0, 100000, 1000)],
    columns=[f"col{j}" for j in range(1, 1000)],
    index=pd.Index(list(range(0, 100000, 1000))),
)
Collaborator:
Suggested change:
-large_df = pd.DataFrame(
-    [[i + j for j in range(1, 1000)] for i in range(0, 100000, 1000)],
-    columns=[f"col{j}" for j in range(1, 1000)],
-    index=pd.Index(list(range(0, 100000, 1000))),
-)
+large_df = pd.DataFrame(
+    [[i + j for j in range(1000)] for i in range(0, 100 * 1000, 1000)]
+).add_prefix('col')

Unless you see a good reason for the 1-indexing and a custom index, I prefer this.
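For orientation, a hypothetical sanity check of the fixtures defined above (not part of the diff): each entry of `small_dfs` is a single-row, 1000-column frame, so the concatenations in the parametrization below stack 100 or 103 rows:

    assert all(df.shape == (1, 1000) for df in small_dfs)
    assert pd.concat(small_dfs).shape == (100, 1000)  # 100 one-row frames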



@pytest.mark.skipif(
-    Engine.get() != "Dask" and Engine.get() != "Ray",
+    Engine.get() not in ("Dask", "Ray"),
    reason="Rebalancing partitions is only supported for Dask and Ray engines",
)
-def test_rebalance_partitions():
-    small_dfs = [
-        pd.DataFrame(
-            [[i + j for j in range(0, 100)]],
-            columns=[f"col{j}" for j in range(1, 101)],
-            index=pd.Index([i - 1]),
-        )
-        for i in range(1, 10001, 100)
-    ]
-    large_df = pd.concat(small_dfs)
+@pytest.mark.parametrize(
+    "large_df,col_length",
+    [
+        (pd.concat(small_dfs), 100),
+        (pd.concat([pd.concat(small_dfs)] + small_dfs[:3]), 103),
Collaborator:
Does this really trigger two rebalances?
+        (pd.concat([large_df] + small_dfs[:3]), 103),
+    ],
+)
+def test_rebalance_partitions(large_df, col_length):
    large_modin_frame = large_df._query_compiler._modin_frame
    assert large_modin_frame._partitions.shape == (
-        4,
-        4,
+        NPartitions.get(),
+        NPartitions.get(),
    ), "Partitions were not rebalanced after concat."
    assert all(
        isinstance(ptn, large_modin_frame._partition_mgr_cls._column_partitions_class)
        for ptn in large_modin_frame._partitions.flatten()
    )
-    large_df = large_df.apply(lambda x: x + 1)
-    large_modin_frame = large_df._query_compiler._modin_frame
-    assert large_modin_frame._partitions.shape == (
+    # The following check tests that we can correctly form full-axis virtual partitions
+    # over the orthogonal axis from non-full-axis virtual partitions.

    def col_apply_func(col):
        assert len(col) == col_length, "Partial axis partition detected."
Collaborator:
Why do we need this assert? Why isn't it enough to check that the column-wise apply does what pandas would do?

Collaborator (Author):
We want to ensure that this is a full-axis partition, and not just a partial-axis partition. The function as I've written it will run correctly even if it is a partial-axis partition, so I thought this assert would be a good way to keep the function simple but still test that it is full-axis.

Collaborator:
I think this test should check for functional correctness. We already check that we get the apply result correct when the starting partitions are non-full-axis virtual partitions. I don't think it's right to check for internal implementation details here.

Collaborator (Author):
Hmm, I'm not sure that I agree with that. This file is called test_internals, which would imply it's testing internal details, and it makes sense to ensure that the internal partitioning code didn't incorrectly make a full-axis partition.

Collaborator:
I don't think it's appropriate to test any internal detail here just because it's test_internals. Why not do something like have this function return col[::-1]? That way we can be pretty confident that Modin is really applying the function across the full axis. I think testing the intermediate representations during the function call is outside the scope of this particular test case.

Collaborator (Author):
I could do that, but then I would have to go into each partition to determine what the last row is and then double-check the output, so this feels less intrusive. Open to other suggestions, though! Perhaps the solution is to add unit testing for partition-layer stuff into the codebase, as @pyrito suggested, and move this code there, so that test_internals just tests the overall internals as it's supposed to, and implementation details can be tested at that level?

        return col + 1

    large_df = large_df.apply(col_apply_func)
    new_large_modin_frame = large_df._query_compiler._modin_frame
    assert new_large_modin_frame._partitions.shape == (
        NPartitions.get(),
        NPartitions.get(),
    ), "Partitions list shape is incorrect."
    assert all(
        isinstance(ptn, new_large_modin_frame._partition_mgr_cls._partition_class)
        for ptn in new_large_modin_frame._partitions.flatten()
    ), "Partitions are not block partitioned after apply."
    large_df = pd.DataFrame(
        query_compiler=large_df._query_compiler.__constructor__(large_modin_frame)
    )
    # The following check tests that we can correctly form full-axis virtual partitions
    # over the same axis from non-full-axis virtual partitions.

    def row_apply_func(row):
        assert len(row) == 1000, "Partial axis partition detected."
Collaborator:
ditto about the assert
        return row + 1

    large_df = large_df.apply(row_apply_func, axis=1)
    new_large_modin_frame = large_df._query_compiler._modin_frame
    assert new_large_modin_frame._partitions.shape == (
        4,
        4,
-    ), "Partitions are not block partitioned after apply."
+    ), "Partitions list shape is incorrect."
    assert all(
-        isinstance(ptn, large_modin_frame._partition_mgr_cls._partition_class)
-        for ptn in large_modin_frame._partitions.flatten()
-    )
+        isinstance(ptn, new_large_modin_frame._partition_mgr_cls._partition_class)
+        for ptn in new_large_modin_frame._partitions.flatten()
+    ), "Partitions are not block partitioned after apply."