modin-project · mvashishtha · Jul 12, 2022 · Apr 25, 2022 · Apr 25, 2022 · Apr 26, 2022
@@ -34,15 +34,15 @@ specifically for the `PandasOnDask` execution.
 
 * :doc:`PandasOnDaskDataframe <dataframe>`
 * :doc:`PandasOnDaskDataframePartition <partitioning/partition>`
-* :doc:`PandasOnDaskDataframeAxisPartition <partitioning/axis_partition>`
+* :doc:`PandasOnDaskDataframeVirtualPartition <partitioning/virtual_partition>`
 * :doc:`PandasOnDaskDataframePartitionManager <partitioning/partition_manager>`
 
 .. toctree::
     :hidden:
 
     dataframe
     partitioning/partition
-    partitioning/axis_partition
+    partitioning/virtual_partition
     partitioning/partition_manager
 
 
@@ -80,4 +80,4 @@ the user query to execute it on Dask workers. Then, the :py:class:`~modin.core.e
 that will be written into the file in parallel in Dask workers.
 
 .. note::
-   Currently, data egress uses default `pandas` implementation for `pandas on Dask` execution.
+   Currently, data egress uses default `pandas` implementation for `pandas on Dask` execution.
@@ -1,14 +1,14 @@
-PandasOnDaskDataframeAxisPartition
-""""""""""""""""""""""""""""""""""
+PandasOnDaskDataframeVirtualPartition
+"""""""""""""""""""""""""""""""""""""
 
-The class is the specific implementation of :py:class:`~modin.core.dataframe.pandas.partitioning.virtual_partition.PandasDataframeAxisPartition`,
+The class is the specific implementation of :py:class:`~modin.core.dataframe.pandas.partitioning.axis_partition.PandasDataframeAxisPartition`,
 providing the API to perform operations on an axis (column or row) partition using Dask as the execution engine.
 The axis partition is a wrapper over a list of block partitions that are stored in this class.
 
 Public API
 ----------
 
-.. autoclass:: modin.core.execution.dask.implementations.pandas_on_dask.partitioning.virtual_partition.PandasOnDaskDataframeAxisPartition
+.. autoclass:: modin.core.execution.dask.implementations.pandas_on_dask.partitioning.virtual_partition.PandasOnDaskDataframeVirtualPartition
   :members:
 
 PandasOnDaskDataframeColumnPartition

@@ -50,6 +50,8 @@ Key Features and Updates
   * FEAT-#4619: Integrate mypy static type checking (#4620)
 * New Features
   * FEAT-4463: Add experimental fuzzydata integration for testing against a randomized dataframe workflow (#4556)
+  * FEAT-#4419: Extend virtual partitioning API to pandas on Dask (#4420)
+
 
 Contributors
 ------------
@@ -62,4 +64,4 @@ Contributors
 @RehanSD
 @helmeleegy
 @anmyachev
-@d33bs
+@d33bs
@@ -27,7 +27,7 @@
 from modin.error_message import ErrorMessage
 from modin.core.storage_formats.pandas.utils import compute_chunksize
 from modin.core.dataframe.pandas.utils import concatenate
-from modin.config import NPartitions, ProgressBar, BenchmarkMode
+from modin.config import NPartitions, ProgressBar, BenchmarkMode, Engine, StorageFormat
 
 import os
 
@@ -615,11 +615,15 @@ def concat(cls, axis, left_parts, right_parts):
             to_concat = (
                 [left_parts] + right_parts if left_parts.size != 0 else right_parts
             )
-            return (
+            result = (
                 np.concatenate(to_concat, axis=axis) if len(to_concat) else left_parts
             )
         else:
-            return np.append(left_parts, right_parts, axis=axis)
+            result = np.append(left_parts, right_parts, axis=axis)
+        if axis == 0:
+            return cls.rebalance_partitions(result)
+        else:
+            return result
 
     @classmethod
     def to_pandas(cls, partitions):
@@ -1292,7 +1296,15 @@ def finalize(cls, partitions):
     @classmethod
     def rebalance_partitions(cls, partitions):
         """
-        Return the provided array of partitions without rebalancing it.
+        Rebalance a 2-d array of partitions if we are using ``PandasOnRay`` or ``PandasOnDask`` executions.
+
+        For all other executions, the partitions are returned unchanged.
+
+        Rebalance the partitions by building a new array
+        of partitions out of the original ones so that:
+
+        - If all partitions have a length, each new partition has roughly the same number of rows.
+        - Otherwise, each new partition spans roughly the same number of old partitions.
 
         Parameters
         ----------
@@ -1302,6 +1314,103 @@ def rebalance_partitions(cls, partitions):
         Returns
         -------
         np.ndarray
-            The same 2-d array.
+            A NumPy array with either the same partitions or new, rebalanced ones, depending on
+            the execution engine, the storage format, and the number of existing partitions.
         """
+        if Engine.get() in ["Ray", "Dask"] and StorageFormat.get() == "Pandas":
+            # Rebalancing partitions is currently only implemented for PandasOnRay and PandasOnDask.
+            # We rebalance when the ratio of the number of existing partitions to
+            # the ideal number of partitions is larger than this threshold. The
+            # threshold is a heuristic that may need to be tuned for performance.
+            max_excess_of_num_partitions = 1.5
+            num_existing_partitions = partitions.shape[0]
+            ideal_num_new_partitions = NPartitions.get()
+            if (
+                num_existing_partitions
+                <= ideal_num_new_partitions * max_excess_of_num_partitions
+            ):
+                return partitions
+            # If any partition has an unknown length, give each axis partition
+            # roughly the same number of row partitions. We use `_length_cache` here
+            # to avoid materializing any unmaterialized lengths.
+            if any(
+                partition._length_cache is None
+                for row in partitions
+                for partition in row
+            ):
+                # We need each partition to go into an axis partition, but the
+                # number of axis partitions may not evenly divide the number of
+                # partitions.
+                chunk_size = compute_chunksize(
+                    num_existing_partitions, ideal_num_new_partitions, min_block_size=1
+                )
+                return np.array(
+                    [
+                        cls.column_partitions(
+                            partitions[i : i + chunk_size],
+                            full_axis=False,
+                        )
+                        for i in range(
+                            0,
+                            num_existing_partitions,
+                            chunk_size,
+                        )
+                    ]
+                )
+
+            # If we know the number of rows in every partition, then we should try
+            # instead to give each new partition roughly the same number of rows.
+            new_partitions = []
+            # `start` is the index of the first existing partition that we want to
+            # put into the current new partition.
+            start = 0
+            total_rows = sum(part.length() for part in partitions[:, 0])
+            ideal_partition_size = compute_chunksize(
+                total_rows, ideal_num_new_partitions, min_block_size=1
+            )
+            for _ in range(ideal_num_new_partitions):
+                # We might pick up old partitions too quickly and exhaust all of them.
+                if start >= len(partitions):
+                    break
+                # `stop` is the index of the last existing partition so far that we
+                # want to put into the current new partition.
+                stop = start
+                partition_size = partitions[start][0].length()
+                # Add existing partitions into the current new partition until the
+                # number of rows in the new partition hits `ideal_partition_size`.
+                while stop < len(partitions) and partition_size < ideal_partition_size:
+                    stop += 1
+                    if stop < len(partitions):
+                        partition_size += partitions[stop][0].length()
+                # If the new partition is larger than we want, split the last
+                # current partition that it contains into two partitions, where
+                # the first partition has just enough rows to make the current
+                # new partition have length `ideal_partition_size`, and the second
+                # partition has the remainder.
+                if partition_size > ideal_partition_size * max_excess_of_num_partitions:
+                    new_last_partition_size = ideal_partition_size - sum(
+                        row[0].length() for row in partitions[start:stop]
+                    )
+                    partitions = np.insert(
+                        partitions,
+                        stop + 1,
+                        [
+                            obj.mask(slice(new_last_partition_size, None), slice(None))
+                            for obj in partitions[stop]
+                        ],
+                        0,
+                    )
+                    partitions[stop, :] = [
+                        obj.mask(slice(None, new_last_partition_size), slice(None))
+                        for obj in partitions[stop]
+                    ]
+                    partition_size = ideal_partition_size
+                new_partitions.append(
+                    cls.column_partitions(
+                        (partitions[start : stop + 1]),
+                        full_axis=partition_size == total_rows,
+                    )
+                )
+                start = stop + 1
+            return np.array(new_partitions)
         return partitions
@@ -15,6 +15,7 @@
 
 from modin.core.dataframe.pandas.dataframe.dataframe import PandasDataframe
 from ..partitioning.partition_manager import PandasOnDaskDataframePartitionManager
+from modin.core.execution.dask.common.engine_wrapper import DaskWrapper
 
 
 class PandasOnDaskDataframe(PandasDataframe):
@@ -41,22 +42,63 @@ class PandasOnDaskDataframe(PandasDataframe):
 
     _partition_mgr_cls = PandasOnDaskDataframePartitionManager
 
+    def _get_partition_size_along_axis(self, partition, axis=0):
+        """
+        Request the size of a partition along one axis.
+
+        Parameters
+        ----------
+        partition : ``PandasOnDaskDataframeVirtualPartition`` or ``PandasOnDaskDataframePartition``
+            The partition to measure.
+        axis : int, default: 0
+            Axis to measure along: row count when 0, column count otherwise.
+
+        Returns
+        -------
+        list
+            The ``_data`` of each applied measurement. The measurements add up to the
+            overall size of `partition` along `axis`.
+
+        Notes
+        -----
+        This method never materializes anything itself, so callers can gather the results
+        for many partitions, virtual or physical, and materialize them all at once.
+        """
+
+        def measure(df):
+            return len(df.columns) if axis else len(df)
+
+        if isinstance(partition, self._partition_mgr_cls._partition_class):
+            # A physical partition is measured as a single unit.
+            targets = [partition]
+        elif partition.axis == axis:
+            # Measure every constituent of a virtual partition on the same axis.
+            targets = partition.list_of_partitions_to_combine
+        else:
+            # On the other axis only the first constituent is measured
+            # (presumably all constituents agree there -- TODO confirm).
+            targets = [partition.list_of_partitions_to_combine[0]]
+        # Hand back only ``_data`` so the caller decides when to materialize.
+        return [target.apply(measure)._data for target in targets]
+
     @property
     def _row_lengths(self):
         """
-        Compute the row partitions lengths if they are not cached.
+        Compute the row partition lengths if they are not cached.
 
         Returns
         -------
         list
             A list of row partitions lengths.
         """
         if self._row_lengths_cache is None:
-            self._row_lengths_cache = (
-                self._partition_mgr_cls.get_objects_from_partitions(
-                    [obj.apply(lambda df: len(df)) for obj in self._partitions.T[0]]
-                )
+            row_lengths_list = DaskWrapper.materialize(
+                [
+                    self._get_partition_size_along_axis(obj, axis=0)
+                    for obj in self._partitions.T[0]
+                ]
             )
+            self._row_lengths_cache = [sum(len_list) for len_list in row_lengths_list]
         return self._row_lengths_cache
 
     @property
@@ -70,12 +112,13 @@ def _column_widths(self):
             A list of column partitions widths.
         """
         if self._column_widths_cache is None:
-            self._column_widths_cache = (
-                self._partition_mgr_cls.get_objects_from_partitions(
-                    [
-                        obj.apply(lambda df: len(df.columns))
-                        for obj in self._partitions[0]
-                    ]
-                )
+            col_widths_list = DaskWrapper.materialize(
+                [
+                    self._get_partition_size_along_axis(obj, axis=1)
+                    for obj in self._partitions[0]
+                ]
             )
+            self._column_widths_cache = [
+                sum(width_list) for width_list in col_widths_list
+            ]
         return self._column_widths_cache