ray-project · ericl · Jun 22, 2023 · Jun 12, 2023 · Jun 12, 2023 · Jun 13, 2023
@@ -32,6 +32,7 @@ def __init__(
         # Whether the block list is owned by consuming APIs, and if so it can be
         # eagerly deleted after read by the consumer.
         self._owned_by_consumer = owned_by_consumer
+        self._estimated_num_blocks = None
 
     def __repr__(self):
         return f"BlockList(owned_by_consumer={self._owned_by_consumer})"
@@ -217,6 +218,10 @@ def initial_num_blocks(self) -> int:
         """Returns the number of blocks of this BlockList."""
         return self._num_blocks
 
+    def estimated_num_blocks(self) -> int:
+        """Returns the estimated number of blocks of this BlockList.
+
+        The estimate is the value stored in ``_estimated_num_blocks``; when no
+        estimate has been set (``None``), this falls back to ``_num_blocks``.
+        """
+        # Compare against None explicitly so that a legitimate estimate of 0 is
+        # reported as-is instead of being treated as "unset".
+        if self._estimated_num_blocks is not None:
+            return self._estimated_num_blocks
+        return self._num_blocks
+
     def executed_num_blocks(self) -> int:
         """Returns the number of output blocks after execution.
 

@@ -19,6 +19,7 @@ def __init__(
         self,
         input_data: Optional[List[RefBundle]] = None,
         input_data_factory: Callable[[], List[RefBundle]] = None,
+        override_num_blocks: Optional[int] = None,
     ):
         """Create an InputDataBuffer.
 
@@ -37,6 +38,7 @@ def __init__(
             assert input_data_factory is not None
             self._input_data_factory = input_data_factory
             self._is_input_initialized = False
+        self._override_num_blocks = override_num_blocks
         super().__init__("Input", [])
 
     def start(self, options: ExecutionOptions) -> None:
@@ -53,7 +55,7 @@ def get_next(self) -> RefBundle:
         return self._input_data.pop(0)
 
     def num_outputs_total(self) -> Optional[int]:
-        return self._num_outputs
+        # Prefer the override passed to the constructor when one was given.
+        # Compare against None explicitly so an override of 0 is not ignored.
+        if self._override_num_blocks is not None:
+            return self._override_num_blocks
+        return self._num_outputs
 
     def get_stats(self) -> StatsDict:
         return {}

@@ -11,8 +11,17 @@ def __init__(
         self,
         datasource: Datasource,
         read_tasks: List[ReadTask],
+        estimated_num_blocks: int,
         ray_remote_args: Optional[Dict[str, Any]] = None,
     ):
-        super().__init__(f"Read{datasource.get_name()}", None, ray_remote_args)
+        if len(read_tasks) == estimated_num_blocks:
+            suffix = ""
+        else:
+            suffix = f"->SplitBlocks({int(estimated_num_blocks / len(read_tasks))})"
+        super().__init__(f"Read{datasource.get_name()}{suffix}", None, ray_remote_args)
         self._datasource = datasource
+        self._estimated_num_blocks = estimated_num_blocks
         self._read_tasks = read_tasks
+
+    def fusable(self) -> bool:
+        """Whether this read is eligible for fusion with a downstream operator.
+
+        True only when the estimated number of blocks equals the number of
+        read tasks.
+        """
+        num_read_tasks = len(self._read_tasks)
+        return num_read_tasks == self._estimated_num_blocks
@@ -22,6 +22,7 @@
     Repartition,
 )
 from ray.data._internal.logical.operators.map_operator import AbstractUDFMap
+from ray.data._internal.logical.operators.read_operator import Read
 from ray.data._internal.stats import StatsDict
 from ray.data.block import Block
 
@@ -130,6 +131,9 @@ def _can_fuse(self, down_op: PhysicalOperator, up_op: PhysicalOperator) -> bool:
         down_logical_op = self._op_map[down_op]
         up_logical_op = self._op_map[up_op]
 
+        if isinstance(up_logical_op, Read) and not up_logical_op.fusable():
+            return False
+
         # If the downstream operator takes no input, it cannot be fused with
         # the upstream operator.
         if not down_logical_op._input_dependencies:

@@ -216,7 +216,7 @@ def get_plan_as_string(self, classname: str) -> str:
         if dataset_blocks is None:
             num_blocks = "?"
         else:
-            num_blocks = dataset_blocks.initial_num_blocks()
+            num_blocks = dataset_blocks.estimated_num_blocks()
         dataset_str = "{}(num_blocks={}, num_rows={}, schema={})".format(
             classname, num_blocks, count, schema_str
         )

@@ -58,7 +58,9 @@ def get_input_data() -> List[RefBundle]:
             for read_task in read_tasks
         ]
 
-    inputs = InputDataBuffer(input_data_factory=get_input_data)
+    inputs = InputDataBuffer(
+        input_data_factory=get_input_data, override_num_blocks=op._estimated_num_blocks
+    )
 
     def do_read(blocks: Iterator[ReadTask], _: TaskContext) -> Iterator[Block]:
         for read_task in blocks:

@@ -91,7 +91,7 @@ def _autodetect_parallelism(
     ctx: DataContext,
     reader: Optional["Reader"] = None,
     avail_cpus: Optional[int] = None,
-) -> (int, int):
+) -> (int, int, Optional[int]):
     """Returns parallelism to use and the min safe parallelism to avoid OOMs.
 
     This detects parallelism using the following heuristics, applied in order:
@@ -112,8 +112,9 @@ def _autodetect_parallelism(
         avail_cpus: Override avail cpus detection (for testing only).
 
     Returns:
-        Tuple of detected parallelism (only if -1 was specified), and the min safe
-        parallelism (which can be used to generate warnings about large blocks).
+        Tuple of detected parallelism (only if -1 was specified), the min safe
+        parallelism (which can be used to generate warnings about large blocks),
+        and the estimated in-memory size of the dataset, if available.
     """
     min_safe_parallelism = 1
     max_reasonable_parallelism = sys.maxsize
@@ -141,7 +142,7 @@ def _autodetect_parallelism(
             f"estimated_available_cpus={avail_cpus} and "
             f"estimated_data_size={mem_size}."
         )
-    return parallelism, min_safe_parallelism
+    return parallelism, min_safe_parallelism, mem_size
 
 
 def _estimate_avail_cpus(cur_pg: Optional["PlacementGroup"]) -> int:

@@ -195,6 +195,7 @@ class ReadTask(Callable[[], Iterable[Block]]):
     def __init__(self, read_fn: Callable[[], Iterable[Block]], metadata: BlockMetadata):
         self._metadata = metadata
         self._read_fn = read_fn
+        self._additional_output_splits = 1
 
     def get_metadata(self) -> BlockMetadata:
         return self._metadata
@@ -211,13 +212,30 @@ def __call__(self) -> Iterable[Block]:
 
         if context.block_splitting_enabled:
             for block in result:
-                yield block
+                yield from self._do_additional_splits(block)
         else:
             builder = DelegatingBlockBuilder()
             for block in result:
                 builder.add_block(block)
             yield builder.build()
 
+    def _set_additional_split_factor(self, k: int) -> None:
+        """Set how many slices each block is split into by _do_additional_splits.
+
+        A value of 1 or less leaves the blocks unsplit.
+        """
+        self._additional_output_splits = k
+
+    def _do_additional_splits(self, block: Block) -> Iterable[Block]:
+        """Yield ``block`` split into ``_additional_output_splits`` row slices.
+
+        When the split factor is 1 or less, the block is yielded unchanged.
+        Otherwise the rows are divided as evenly as possible: the first
+        ``num_rows % k`` slices get one extra row, which is the same partition
+        that ``np.array_split`` produces. Slices may be empty when the block
+        has fewer rows than the split factor.
+
+        Args:
+            block: The block to split.
+
+        Yields:
+            Copied slices of ``block``, or ``block`` itself if no split applies.
+        """
+        num_splits = self._additional_output_splits
+        if num_splits <= 1:
+            yield block
+            return
+        accessor = BlockAccessor.for_block(block)
+        # Derive the slice sizes arithmetically instead of materializing an
+        # index array with one entry per row just to measure its pieces.
+        base_size, num_larger = divmod(accessor.num_rows(), num_splits)
+        offset = 0
+        for i in range(num_splits):
+            size = base_size + 1 if i < num_larger else base_size
+            yield accessor.slice(offset, offset + size, copy=True)
+            offset += size
+
 
 @PublicAPI
 class RangeDatasource(Datasource):

@@ -288,6 +288,14 @@ def get_read_tasks(self, parallelism: int) -> List[ReadTask]:
 
             if meta.size_bytes is not None:
                 meta.size_bytes = int(meta.size_bytes * self._encoding_ratio)
+
+            if meta.num_rows is not None and meta.size_bytes is not None:
+                row_size = meta.size_bytes / meta.num_rows
+                default_read_batch_size = min(
+                    PARQUET_READER_ROW_BATCH_SIZE, 64e6 / row_size
+                )
+            else:
+                default_read_batch_size = PARQUET_READER_ROW_BATCH_SIZE
             block_udf, reader_args, columns, schema = (
                 self._block_udf,
                 self._reader_args,
@@ -299,6 +307,7 @@ def get_read_tasks(self, parallelism: int) -> List[ReadTask]:
                     lambda p=serialized_pieces: _read_pieces(
                         block_udf,
                         reader_args,
+                        default_read_batch_size,
                         columns,
                         schema,
                         p,
@@ -363,7 +372,12 @@ def _estimate_files_encoding_ratio(self) -> float:
 
 
 def _read_pieces(
-    block_udf, reader_args, columns, schema, serialized_pieces: List[_SerializedPiece]
+    block_udf,
+    reader_args,
+    default_read_batch_size,
+    columns,
+    schema,
+    serialized_pieces: List[_SerializedPiece],
 ) -> Iterator["pyarrow.Table"]:
     # This import is necessary to load the tensor extension type.
     from ray.data.extensions.tensor_extension import ArrowTensorType  # noqa
@@ -387,7 +401,7 @@ def _read_pieces(
 
     logger.debug(f"Reading {len(pieces)} parquet pieces")
     use_threads = reader_args.pop("use_threads", False)
-    batch_size = reader_args.pop("batch_size", PARQUET_READER_ROW_BATCH_SIZE)
+    batch_size = reader_args.pop("batch_size", default_read_batch_size)
     for piece in pieces:
         part = _get_partition_keys(piece.partition_expression)
         batches = piece.to_batches(

@@ -1,5 +1,6 @@
 import collections
 import logging
+import math
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -133,7 +134,7 @@ def from_items(
     if parallelism == 0:
         raise ValueError(f"parallelism must be -1 or > 0, got: {parallelism}")
 
-    detected_parallelism, _ = _autodetect_parallelism(
+    detected_parallelism, _, _ = _autodetect_parallelism(
         parallelism,
         ray.util.get_current_placement_group(),
         DataContext.get_current(),
@@ -350,9 +351,12 @@ def read_datasource(
             force_local = True
 
     if force_local:
-        requested_parallelism, min_safe_parallelism, read_tasks = _get_read_tasks(
-            datasource, ctx, cur_pg, parallelism, local_uri, read_args
-        )
+        (
+            requested_parallelism,
+            min_safe_parallelism,
+            inmemory_size,
+            read_tasks,
+        ) = _get_read_tasks(datasource, ctx, cur_pg, parallelism, local_uri, read_args)
     else:
         # Prepare read in a remote task at same node.
         # NOTE: in Ray client mode, this is expected to be run on head node.
@@ -365,7 +369,12 @@ def read_datasource(
             _get_read_tasks, retry_exceptions=False, num_cpus=0
         ).options(scheduling_strategy=scheduling_strategy)
 
-        requested_parallelism, min_safe_parallelism, read_tasks = ray.get(
+        (
+            requested_parallelism,
+            min_safe_parallelism,
+            inmemory_size,
+            read_tasks,
+        ) = ray.get(
             get_read_tasks.remote(
                 datasource,
                 ctx,
@@ -376,28 +385,51 @@ def read_datasource(
             )
         )
 
-    if read_tasks and len(read_tasks) < min_safe_parallelism * 0.7:
-        perc = 1 + round((min_safe_parallelism - len(read_tasks)) / len(read_tasks), 1)
-        logger.warning(
-            f"{WARN_PREFIX} The blocks of this dataset are estimated to be {perc}x "
-            "larger than the target block size "
-            f"of {int(ctx.target_max_block_size / 1024 / 1024)} MiB. This may lead to "
-            "out-of-memory errors during processing. Consider reducing the size of "
-            "input files or using `.repartition(n)` to increase the number of "
-            "dataset blocks."
-        )
-    elif len(read_tasks) < requested_parallelism and (
-        len(read_tasks) < ray.available_resources().get("CPU", 1) // 2
-    ):
-        logger.warning(
-            f"{WARN_PREFIX} The number of blocks in this dataset "
-            f"({len(read_tasks)}) "
-            f"limits its parallelism to {len(read_tasks)} concurrent tasks. "
-            "This is much less than the number "
-            "of available CPU slots in the cluster. Use `.repartition(n)` to "
-            "increase the number of "
-            "dataset blocks."
-        )
+    #    if read_tasks and len(read_tasks) < min_safe_parallelism * 0.7:
+    #        perc = 1 + round((min_safe_parallelism - len(read_tasks)) / len(read_tasks), 1)
+    #        logger.warning(
+    #            f"{WARN_PREFIX} The blocks of this dataset are estimated to be {perc}x "
+    #            "larger than the target block size "
+    #            f"of {int(ctx.target_max_block_size / 1024 / 1024)} MiB. This may lead to "
+    #            "out-of-memory errors during processing. Consider reducing the size of "
+    #            "input files or using `.repartition(n)` to increase the number of "
+    #            "dataset blocks."
+    #        )
+    #    elif len(read_tasks) < requested_parallelism and (
+    #        len(read_tasks) < ray.available_resources().get("CPU", 1) // 2
+    #    ):
+    #        logger.warning(
+    #            f"{WARN_PREFIX} The number of blocks in this dataset "
+    #            f"({len(read_tasks)}) "
+    #            f"limits its parallelism to {len(read_tasks)} concurrent tasks. "
+    #            "This is much less than the number "
+    #            "of available CPU slots in the cluster. Use `.repartition(n)` to "
+    #            "increase the number of "
+    #            "dataset blocks."
+    #        )
+
+    # TODO update the warnings above
+    if len(read_tasks) < requested_parallelism:
+        desired_splits_per_file = requested_parallelism / len(read_tasks)
+        print("Desired splits per file", desired_splits_per_file)
+        if inmemory_size:
+            expected_block_size = inmemory_size / len(read_tasks)
+            print("Expected block size", expected_block_size)
+            size_based_splits = math.floor(
+                max(1, expected_block_size / ctx.target_max_block_size)
+            )
+            print("Size based splits", size_based_splits)
+        else:
+            size_based_splits = 1
+        k = math.ceil(desired_splits_per_file / size_based_splits)
+        estimated_num_blocks = len(read_tasks) * size_based_splits * k
+        print("Additional split factor", k)
+        for r in read_tasks:
+            r._set_additional_split_factor(k)
+        print("Estimated num blocks", estimated_num_blocks)
+    else:
+        print("No additional splits are needed")
+        estimated_num_blocks = len(read_tasks)
 
     read_stage_name = f"Read{datasource.get_name()}"
     available_cpu_slots = ray.available_resources().get("CPU", 1)
@@ -423,10 +455,11 @@ def read_datasource(
         ray_remote_args=ray_remote_args,
         owned_by_consumer=False,
     )
+    block_list._estimated_num_blocks = estimated_num_blocks
 
     # TODO(hchen): move _get_read_tasks and related code to the Read physical operator,
     # after removing LazyBlockList code path.
-    read_op = Read(datasource, read_tasks, ray_remote_args)
+    read_op = Read(datasource, read_tasks, estimated_num_blocks, ray_remote_args)
     logical_plan = LogicalPlan(read_op)
 
     return Dataset(
@@ -1947,7 +1980,7 @@ def _get_read_tasks(
     parallelism: int,
     local_uri: bool,
     kwargs: dict,
-) -> Tuple[int, int, List[ReadTask]]:
+) -> Tuple[int, int, Optional[int], List[ReadTask]]:
     """Generates read tasks.
 
     Args:
@@ -1959,19 +1992,20 @@ def _get_read_tasks(
 
     Returns:
         Request parallelism from the datasource, the min safe parallelism to avoid
-        OOM, and the list of read tasks generated.
+        OOM, the estimated in-memory data size, and the list of read tasks generated.
     """
     kwargs = _unwrap_arrow_serialization_workaround(kwargs)
     if local_uri:
         kwargs["local_uri"] = local_uri
     DataContext._set_current(ctx)
     reader = ds.create_reader(**kwargs)
-    requested_parallelism, min_safe_parallelism = _autodetect_parallelism(
+    requested_parallelism, min_safe_parallelism, mem_size = _autodetect_parallelism(
         parallelism, cur_pg, DataContext.get_current(), reader
     )
     return (
         requested_parallelism,
         min_safe_parallelism,
+        mem_size,
         reader.get_read_tasks(requested_parallelism),
     )