ray-project · rkooo567 · Aug 30, 2024 · Aug 23, 2024
diff --git a/python/ray/dag/compiled_dag_node.py b/python/ray/dag/compiled_dag_node.py
@@ -8,6 +8,7 @@
 import traceback
 from typing import NamedTuple
 
+from ray.experimental.channel.cached_channel import CachedChannel
 import ray
 from ray.exceptions import RayTaskError, RayChannelError
 from ray.util.annotations import PublicAPI
@@ -247,13 +248,14 @@ def __init__(
         self.input_variant = input_variant
         self.channel_idx = channel_idx
 
-    def resolve(self, channel_results: Any):
+    def resolve(self, channel_results: Any) -> Any:
         """
         Resolve the input value from the channel results.
 
         Args:
             channel_results: The results from reading the input channels.
         """
+
         if isinstance(self.input_variant, ChannelInterface):
             value = channel_results[self.channel_idx]
         elif isinstance(self.input_variant, DAGInputAdapter):
@@ -455,7 +457,11 @@ def _write(self) -> bool:
             exit = True
         return exit
 
-    def exec_operation(self, class_handle, op_type: _DAGNodeOperationType) -> bool:
+    def exec_operation(
+        self,
+        class_handle,
+        op_type: _DAGNodeOperationType,
+    ) -> bool:
         """
         An ExecutableTask corresponds to a DAGNode. It consists of three
         operations: READ, COMPUTE, and WRITE, which should be executed in
@@ -1023,12 +1029,17 @@ def _get_or_compile(
                         )
                     )
                 else:
+                    # Use reader_handles_set to deduplicate readers on the same actor,
+                    # because with CachedChannel each actor will only read from the
+                    # upstream channel once.
+                    reader_handles_set = set()
                     for reader in readers:
                         reader_handle = reader.dag_node._get_actor_handle()
-                        reader_and_node_list.append(
-                            (reader_handle, self._get_node_id(reader_handle))
-                        )
-
+                        if reader_handle not in reader_handles_set:
+                            reader_and_node_list.append(
+                                (reader_handle, self._get_node_id(reader_handle))
+                            )
+                        reader_handles_set.add(reader_handle)
                 fn = task.dag_node._get_remote_method("__ray_call__")
                 task.output_channel = ray.get(
                     fn.remote(
@@ -1110,34 +1121,94 @@ def _get_or_compile(
 
         # Create executable tasks for each actor
         for actor_handle, tasks in self.actor_to_tasks.items():
-            executable_tasks = []
+            # Dict from the arg to the set of tasks that consume it.
+            arg_to_consumers: Dict[DAGNode, Set[CompiledTask]] = defaultdict(set)
+            # The number of tasks that consume InputNode (or InputAttributeNode)
+            # Note that _preprocess() ensures that all tasks either use InputNode
+            # or use InputAttributeNode, but not both.
+            num_input_consumers = 0
+
+            # Step 1: populate num_channel_reads and perform some validation.
             for task in tasks:
-                resolved_args = []
                 has_at_least_one_channel_input = False
                 for arg in task.args:
                     if isinstance(arg, InputNode):
-                        input_adapter = DAGInputAdapter(None, self.dag_input_channel)
-                        resolved_args.append(input_adapter)
                         has_at_least_one_channel_input = True
+                        arg_to_consumers[arg].add(task)
+                        num_input_consumers = max(
+                            num_input_consumers, len(arg_to_consumers[arg])
+                        )
                     elif isinstance(arg, InputAttributeNode):
-                        input_adapter = DAGInputAdapter(arg, self.dag_input_channel)
-                        resolved_args.append(input_adapter)
                         has_at_least_one_channel_input = True
+                        arg_to_consumers[arg].add(task)
+                        num_input_consumers = max(
+                            num_input_consumers, len(arg_to_consumers[arg])
+                        )
                     elif isinstance(arg, DAGNode):  # Other DAGNodes
+                        has_at_least_one_channel_input = True
+                        arg_to_consumers[arg].add(task)
                         arg_idx = self.dag_node_to_idx[arg]
                         arg_channel = self.idx_to_task[arg_idx].output_channel
                         assert arg_channel is not None
-                        resolved_args.append(arg_channel)
-                        has_at_least_one_channel_input = True
-                    else:
-                        resolved_args.append(arg)
                 # TODO: Support no-input DAGs (use an empty object to signal).
                 if not has_at_least_one_channel_input:
                     raise ValueError(
                         "Compiled DAGs require each task to take a "
                         "ray.dag.InputNode or at least one other DAGNode as an "
                         "input"
                     )
+
+            # Step 2: create cached channels if needed
+
+            # Dict from original channel to the channel to be used in execution.
+            # The value of this dict is either the original channel or a newly
+            # created CachedChannel (if the original channel is read more than once).
+            channel_dict: Dict[ChannelInterface, ChannelInterface] = {}
+            for arg, consumers in arg_to_consumers.items():
+                # Handle non-input args
+                if not isinstance(arg, InputNode) and not isinstance(
+                    arg, InputAttributeNode
+                ):
+                    arg_idx = self.dag_node_to_idx[arg]
+                    arg_channel = self.idx_to_task[arg_idx].output_channel
+                    if len(consumers) > 1:
+                        channel_dict[arg_channel] = CachedChannel(
+                            len(consumers),
+                            arg_channel,
+                        )
+                    else:
+                        channel_dict[arg_channel] = arg_channel
+            # Handle input args
+            if num_input_consumers > 1:
+                channel_dict[self.dag_input_channel] = CachedChannel(
+                    num_input_consumers,
+                    self.dag_input_channel,
+                )
+            else:
+                channel_dict[self.dag_input_channel] = self.dag_input_channel
+
+            # Step 3: create executable tasks for the actor
+            executable_tasks = []
+            for task in tasks:
+                resolved_args = []
+                for arg in task.args:
+                    if isinstance(arg, InputNode):
+                        input_channel = channel_dict[self.dag_input_channel]
+                        input_adapter = DAGInputAdapter(None, input_channel)
+                        resolved_args.append(input_adapter)
+                    elif isinstance(arg, InputAttributeNode):
+                        input_channel = channel_dict[self.dag_input_channel]
+                        input_adapter = DAGInputAdapter(arg, input_channel)
+                        resolved_args.append(input_adapter)
+                    elif isinstance(arg, DAGNode):  # Other DAGNodes
+                        arg_idx = self.dag_node_to_idx[arg]
+                        arg_channel = self.idx_to_task[arg_idx].output_channel
+                        assert arg_channel is not None
+                        arg_channel = channel_dict[arg_channel]
+                        resolved_args.append(arg_channel)
+                    else:
+                        # Constant arg
+                        resolved_args.append(arg)
                 executable_task = ExecutableTask(
                     task,
                     resolved_args,

diff --git a/python/ray/dag/tests/experimental/test_accelerated_dag.py b/python/ray/dag/tests/experimental/test_accelerated_dag.py
@@ -267,6 +267,82 @@ def test_actor_method_bind_same_constant(ray_start_regular):
     compiled_dag.teardown()
 
 
+def test_actor_method_bind_same_input(ray_start_regular):
+    actor = Actor.remote(0)
+    with InputNode() as inp:
+        # Test binding input node to the same method
+        # of same actor multiple times: execution
+        # should not hang.
+        output1 = actor.inc.bind(inp)
+        output2 = actor.inc.bind(inp)
+        dag = MultiOutputNode([output1, output2])
+    compiled_dag = dag.experimental_compile()
+    expected = [[0, 0], [1, 2], [4, 6]]
+    for i in range(3):
+        ref = compiled_dag.execute(i)
+        result = ray.get(ref)
+        assert result == expected[i]
+    compiled_dag.teardown()
+
+
+def test_actor_method_bind_same_input_attr(ray_start_regular):
+    actor = Actor.remote(0)
+    with InputNode() as inp:
+        # Test binding input attribute node to the same method
+        # of same actor multiple times: execution should not
+        # hang.
+        output1 = actor.inc.bind(inp[0])
+        output2 = actor.inc.bind(inp[0])
+        dag = MultiOutputNode([output1, output2])
+    compiled_dag = dag.experimental_compile()
+    expected = [[0, 0], [1, 2], [4, 6]]
+    for i in range(3):
+        ref = compiled_dag.execute(i)
+        result = ray.get(ref)
+        assert result == expected[i]
+    compiled_dag.teardown()
+
+
+def test_actor_method_bind_same_arg(ray_start_regular):
+    a1 = Actor.remote(0)
+    a2 = Actor.remote(0)
+    with InputNode() as inp:
+        # Test binding arg to the same method
+        # of same actor multiple times: execution
+        # should not hang.
+        output1 = a1.echo.bind(inp)
+        output2 = a2.inc.bind(output1)
+        output3 = a2.inc.bind(output1)
+        dag = MultiOutputNode([output2, output3])
+    compiled_dag = dag.experimental_compile()
+    expected = [[0, 0], [1, 2], [4, 6]]
+    for i in range(3):
+        ref = compiled_dag.execute(i)
+        result = ray.get(ref)
+        assert result == expected[i]
+    compiled_dag.teardown()
+
+
+def test_mixed_bind_same_input(ray_start_regular):
+    a1 = Actor.remote(0)
+    a2 = Actor.remote(0)
+    with InputNode() as inp:
+        # Test binding input node to the same method
+        # of different actors multiple times: execution
+        # should not hang.
+        output1 = a1.inc.bind(inp)
+        output2 = a1.inc.bind(inp)
+        output3 = a2.inc.bind(inp)
+        dag = MultiOutputNode([output1, output2, output3])
+    compiled_dag = dag.experimental_compile()
+    expected = [[0, 0, 0], [1, 2, 1], [4, 6, 3]]
+    for i in range(3):
+        ref = compiled_dag.execute(i)
+        result = ray.get(ref)
+        assert result == expected[i]
+    compiled_dag.teardown()
+
+
 def test_regular_args(ray_start_regular):
     # Test passing regular args to .bind in addition to DAGNode args.
     a = Actor.remote(0)
@@ -304,14 +380,15 @@ def test_multi_args_basic(ray_start_regular):
 def test_multi_args_single_actor(ray_start_regular):
     c = Collector.remote()
     with InputNode() as i:
-        dag = c.collect_two.bind(i[1], i[0])
+        dag = c.collect_three.bind(i[0], i[1], i[0])
 
     compiled_dag = dag.experimental_compile()
 
+    expected = [[0, 1, 0], [0, 1, 0, 1, 2, 1], [0, 1, 0, 1, 2, 1, 2, 3, 2]]
     for i in range(3):
-        ref = compiled_dag.execute(2, 3)
+        ref = compiled_dag.execute(i, i + 1)
         result = ray.get(ref)
-        assert result == [3, 2] * (i + 1)
+        assert result == expected[i]
 
     with pytest.raises(
         ValueError,
@@ -1424,6 +1501,24 @@ def get_node_id(self):
     compiled_dag.teardown()
 
 
+@ray.remote
+class TestWorker:
+    def add_one(self, value):
+        return value + 1
+
+    def add(self, val1, val2):
+        return val1 + val2
+
+    def generate_torch_tensor(self, size) -> torch.Tensor:
+        return torch.zeros(size)
+
+    def add_value_to_tensor(self, value: int, tensor: torch.Tensor) -> torch.Tensor:
+        """
+        Add `value` to all elements of the tensor.
+        """
+        return tensor + value
+
+
 class TestActorInputOutput:
     """
     Accelerated DAGs support the following two cases for the input/output of the graph:
@@ -1436,23 +1531,6 @@ class TestActorInputOutput:
     which is an actor, needs to be the input and output of the graph.
     """
 
-    @ray.remote
-    class Worker:
-        def add_one(self, value):
-            return value + 1
-
-        def add(self, val1, val2):
-            return val1 + val2
-
-        def generate_torch_tensor(self, size) -> torch.Tensor:
-            return torch.zeros(size)
-
-        def add_value_to_tensor(self, value: int, tensor: torch.Tensor) -> torch.Tensor:
-            """
-            Add `value` to all elements of the tensor.
-            """
-            return tensor + value
-
     def test_shared_memory_channel_only(ray_start_cluster):
         """
         Replica -> Worker -> Replica
@@ -1463,7 +1541,7 @@ def test_shared_memory_channel_only(ray_start_cluster):
         @ray.remote
         class Replica:
             def __init__(self):
-                self.w = TestActorInputOutput.Worker.remote()
+                self.w = TestWorker.remote()
                 with InputNode() as inp:
                     dag = self.w.add_one.bind(inp)
                 self.compiled_dag = dag.experimental_compile()
@@ -1487,7 +1565,7 @@ def test_intra_process_channel(ray_start_cluster):
         @ray.remote
         class Replica:
             def __init__(self):
-                self.w = TestActorInputOutput.Worker.remote()
+                self.w = TestWorker.remote()
                 with InputNode() as inp:
                     dag = self.w.add_one.bind(inp)
                     dag = self.w.add_one.bind(dag)
@@ -1512,8 +1590,8 @@ def test_multiple_readers_multiple_writers(ray_start_cluster):
         @ray.remote
         class Replica:
             def __init__(self):
-                w1 = TestActorInputOutput.Worker.remote()
-                w2 = TestActorInputOutput.Worker.remote()
+                w1 = TestWorker.remote()
+                w2 = TestWorker.remote()
                 with InputNode() as inp:
                     dag = MultiOutputNode([w1.add_one.bind(inp), w2.add_one.bind(inp)])
                 self.compiled_dag = dag.experimental_compile()
@@ -1539,8 +1617,8 @@ def test_multiple_readers_single_writer(ray_start_cluster):
         @ray.remote
         class Replica:
             def __init__(self):
-                w1 = TestActorInputOutput.Worker.remote()
-                w2 = TestActorInputOutput.Worker.remote()
+                w1 = TestWorker.remote()
+                w2 = TestWorker.remote()
                 with InputNode() as inp:
                     branch1 = w1.add_one.bind(inp)
                     branch2 = w2.add_one.bind(inp)
@@ -1567,8 +1645,8 @@ def test_single_reader_multiple_writers(ray_start_cluster):
         @ray.remote
         class Replica:
             def __init__(self):
-                w1 = TestActorInputOutput.Worker.remote()
-                w2 = TestActorInputOutput.Worker.remote()
+                w1 = TestWorker.remote()
+                w2 = TestWorker.remote()
                 with InputNode() as inp:
                     dag = w1.add_one.bind(inp)
                     dag = MultiOutputNode([w1.add_one.bind(dag), w2.add_one.bind(dag)])
@@ -1592,8 +1670,8 @@ def test_torch_tensor_type(ray_start_cluster):
         @ray.remote
         class Replica:
             def __init__(self):
-                self._base = TestActorInputOutput.Worker.remote()
-                self._refiner = TestActorInputOutput.Worker.remote()
+                self._base = TestWorker.remote()
+                self._refiner = TestWorker.remote()
 
                 with ray.dag.InputNode() as inp:
                     dag = self._refiner.add_value_to_tensor.bind(

diff --git a/python/ray/experimental/channel/__init__.py b/python/ray/experimental/channel/__init__.py
@@ -1,3 +1,4 @@
+from ray.experimental.channel.cached_channel import CachedChannel
 from ray.experimental.channel.common import (  # noqa: F401
     AwaitableBackgroundReader,
     AwaitableBackgroundWriter,
@@ -16,6 +17,7 @@
 __all__ = [
     "AwaitableBackgroundReader",
     "AwaitableBackgroundWriter",
+    "CachedChannel",
     "Channel",
     "ReaderInterface",
     "SynchronousReader",