Ignore widely-shared dependencies in decide_worker #5325

Open · wants to merge 6 commits into base: main · showing changes from 4 commits
18 changes: 13 additions & 5 deletions distributed/scheduler.py
@@ -2485,12 +2485,13 @@ def decide_worker(self, ts: TaskState) -> WorkerState:
ts.state = "no-worker"
return ws

# Group is larger than cluster with few dependencies? Minimize future data transfers.
# Group fills the cluster and dependencies are much smaller than cluster? Minimize future data transfers.
ndeps_cutoff: Py_ssize_t = min(5, len(self._workers_dv))
if (
valid_workers is None
and len(group) > self._total_nthreads * 2
and len(group._dependencies) < 5
and sum(map(len, group._dependencies)) < 5
and len(group) >= self._total_nthreads
and len(group._dependencies) < ndeps_cutoff
and sum(map(len, group._dependencies)) < ndeps_cutoff
):
ws: WorkerState = group._last_worker

@@ -7982,7 +7983,14 @@ def decide_worker(
     if ts._actor:
         candidates = set(all_workers)
     else:
-        candidates = {wws for dts in deps for wws in dts._who_has}
+        candidates = {
+            wws
+            for dts in deps
+            # Ignore dependencies that will need to be, or already are, copied to all workers
+            if max(len(dts._dependents) / len(dts._group), len(dts._who_has))
Member commented:

Is this intentionally the number of dependents of a task over the length of a group? Wouldn't the behaviour here be drastically different if I have 2 subtrees vs. 200 subtrees, even though the topology is otherwise identical?

Reusing your ASCII art:

    ........  ........ 
    \\\\////  \\\\////
       a         b

and

    ........  ........  ........  ........    ........  ........  ........  ........    ........  ........  ........  ........
    \\\\////  \\\\////  \\\\////  \\\\////    \\\\////  \\\\////  \\\\////  \\\\////    \\\\////  \\\\////  \\\\////  \\\\////
       a         b         c         d       a1         b1         c1         d1       a2         b2         c2         d2

should behave equally

Member (@fjetter) commented on Sep 17, 2021:

My intuition about groups might be off. In my mind, this graph contains two groups: all ... are one group, and all {a*, b*, c*} are another group. The length of that group is then the number of tasks in the group.

If that is correct, len(dependents) / len(group) would evaluate to:

1st graph: #dependents(8) / len(group)(2) == 4
2nd graph: #dependents(8) / len(group)(12) == 2/3
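
Spelled out as a quick sketch (a hypothetical helper and the same counts as above, just restating the arithmetic; this isn't scheduler code):

def dependents_per_group_member(n_dependents_of_task: int, group_size: int) -> float:
    # Ratio used in the diff: dependents of one root task over the size of its group.
    return n_dependents_of_task / group_size

print(dependents_per_group_member(8, group_size=2))   # 1st graph: 4.0
print(dependents_per_group_member(8, group_size=12))  # 2nd graph: ~0.67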

Collaborator (author) replied:

I spent about 1 minute coming up with this metric and then 1 hour trying to explain or figure out if it actually made sense (beyond that it works in this case). It may not. But here's what I'm thinking (this does not answer the question at all, just trying to communicate a lot of thoughts):

Your intuition is correct. The behavior would and should be different if you have 2 subtrees vs 200—depending on the number of workers. Imagine we have 5 workers:

  • In the first graph we have fewer sub-trees (groups) than workers. (This actually should fall in the root-ish task case. More on this later.) So unless we're going to leave 3 workers idle, copying a and b to more workers will improve parallelism. So we don't care where a and b live while scheduling ... because we expect a and b to get transferred around anyway.
  • In the second graph we have more sub-trees than workers. So transferring the a*,b*,c*s won't get things done any faster[^1]—the cluster is already saturated. Therefore, we do want to consider where the root tasks live when scheduling ..., because we can avoid them getting copied around unnecessarily.

So even though local topology is identical, we should schedule these differently depending on cluster size. If we had 100 workers, we'd schedule both graphs the same way (ignore the root task locations, because copying to new workers will give us more parallelism). And if we had 2 workers we'd also schedule both graphs the same way (consider root task locations because every worker already has a root task).

But I actually think all of these cases should be handled by the root task logic. I wrote more notes in e5175ce, but I think we should think of that as "fan-out logic" instead of "root-ish task logic", since it's really about this special case where we're crossing the boundary of cluster size. When we fan out from having fewer keys than workers to more keys than workers, the placement of those few pieces of input data is a poor guide for where the many pieces of output data should go. We have to copy the input data around anyway, so it's a good opportunity to reorganize things—that's what the current root-ish task logic is all about.
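
To make that concrete, here is roughly what the group-level fan-out check from the scheduler.py diff above boils down to (a sketch only: plain attribute names stand in for the Cython ones, and the "valid_workers is None" guard is omitted):

def looks_like_fanout(group, total_nthreads, n_workers, cutoff_max=5):
    # The group is big enough to fill the cluster, and its input data is tiny
    # relative to the cluster, so input locations shouldn't drive placement.
    ndeps_cutoff = min(cutoff_max, n_workers)
    return (
        len(group) >= total_nthreads
        and len(group.dependencies) < ndeps_cutoff
        and sum(map(len, group.dependencies)) < ndeps_cutoff
    )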

This PR is really about when some dependencies make a task seem root-ish, and others don't. Imagine overlaying the graphs

. . . . . . . .
| | | | | | | |
* * * * * * * *

and

. . . . . . . .
\ \ \ \ / / / /
       a    

so you have

. . . . . . . .   . . . . . . . .
|\|\|\|\|/|/|/|   |\|\|\|\|/|/|/|
| | | | a | | |   | | | | b | | |
* * * * * * * *   * * * * * * * *

The .s should never be considered root-ish tasks now—as linear chains, they should definitely take into account the location of the * tasks when scheduling. But if we have 5 workers, every worker will have a * task on it, but only 2 workers will have an a or b. In scheduling the first few .s, there's a tug-of-war between the a and the *—which do we want to schedule near? We want a way to disregard the a.

But—unlike in the simpler case above where we just have a tree—if we have 12 of these subtrees and only 5 workers, we still want to disregard where the as are, because they're much much less important than where the *s are.

And therein is the overall problem with this PR. In one case, we need to pay attention to the as (when they're the only dependency); in other cases, we want to ignore them (when there are other dependencies with few dependents and replicas). Dividing by the group length was an effective but kind of nonsensical way to do this. I think there are probably other better ways that are more logical, but might involve more change.
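
For reference, piecing the split diff hunks back together, the filter being debated is roughly this (a sketch using plain attribute names instead of the Cython underscore ones, not the exact code):

def candidate_workers(deps, all_workers, valid_workers=None):
    # Skip dependencies that are, or soon will be, replicated to (almost)
    # every worker, so they don't pull every worker in as a candidate.
    pool = valid_workers if valid_workers is not None else all_workers
    return {
        ws
        for dts in deps
        if max(len(dts.dependents) / len(dts.group), len(dts.who_has)) < len(pool)
        for ws in dts.who_has
    }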

I'll note that I think amortizing transfer cost (#5326) is a more sensible way to do this. That actually makes sense. The problem is, it doesn't do much here, because transfer costs (even adding the 10ms penalty) are minuscule relative to occupancy. Weighing those two things, occupancy is usually far more significant.

So I also tried a couple ideas to counteract that. What if we could amortize occupancy in some way too? Basically to account for the opportunity cost of taking a spot on a worker that lots of other tasks could possibly run on, that might be better suited.

diff --git a/distributed/scheduler.py b/distributed/scheduler.py
index 1c810f4a..4c8d97b9 100644
--- a/distributed/scheduler.py
+++ b/distributed/scheduler.py
@@ -3418,15 +3418,27 @@ class SchedulerState:
         nbytes: Py_ssize_t
         comm_bytes: double = 0
         xfers: Py_ssize_t = 0
+        alternative_tasks: Py_ssize_t = 0
         for dts in ts._dependencies:
-            if ws not in dts._who_has:
+            if ws in dts._who_has:
+                alternative_tasks += len(dts._waiters) - 1
+            else:
                 nbytes = dts.get_nbytes()
                 # amortize transfer cost over all waiters
                 comm_bytes += nbytes / len(dts._waiters)
                 xfers += 1
 
+        # If there are many other tasks that could run on this worker,
+        # consider how much we are displacing a better task that could run soon
+        # (unless we are that "better task").
+        # TODO wrong duration kinda
+        opportunity_cost: double = (
+            (alternative_tasks * self.get_task_duration(ts) / 2) if xfers else 0.0
+        )
         stack_time: double = ws._occupancy / ws._nthreads
-        start_time: double = stack_time + comm_bytes / self._bandwidth + xfers * 0.01
+        start_time: double = (
+            stack_time + opportunity_cost + comm_bytes / self._bandwidth + xfers * 0.01
+        )
 
         if ts._actor:
             return (len(ws._actors), start_time, ws._nbytes)

This didn't help much. It worked at the beginning, but once the a task was in lots of places, the opportunity_cost showed up on every worker.

I also tried something similar with stack_time *= alternative_tasks / len(ws.processing) to let tasks that could run in fewer other places cut in line more—this is basically another form of #5253. Didn't work either.
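
That variant amounted to something like this (a sketch of the idea only; the parameter names are made up, and the real change would live inside worker_objective):

def objective_with_line_cutting(stack_time, alternative_tasks, n_processing,
                                comm_bytes, bandwidth, xfers):
    # Scale queue time by how many alternative tasks could also run on this
    # worker, so a task with few viable workers is penalized less for
    # choosing a busy one.
    if n_processing:
        stack_time *= alternative_tasks / n_processing
    return stack_time + comm_bytes / bandwidth + xfers * 0.01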

The more I've worked on this, the more I find myself fighting against occupancy. It's hard to find enough incentives or penalties to make a worker with 0.1s occupancy ever beat an idle one.

For fun, I ignored occupancy and just removed stack_time from worker_objective entirely. With only that, plus #5326 and gjoseph92@e5175ce, it passes all the tests in this PR.
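
In other words, something along these lines (a sketch with the occupancy term dropped; attribute names drop the Cython underscore prefixes, and the amortization is the one from the diff earlier in this comment):

def worker_objective_no_occupancy(ts, ws, bandwidth):
    # Rank workers only by (amortized) transfer cost plus a per-transfer
    # penalty; no stack_time / occupancy term.
    comm_bytes = 0.0
    xfers = 0
    for dts in ts.dependencies:
        if ws not in dts.who_has:
            # amortize transfer cost over all waiters, as in #5326
            comm_bytes += dts.get_nbytes() / len(dts.waiters)
            xfers += 1
    start_time = comm_bytes / bandwidth + xfers * 0.01
    if ts.actor:
        return (len(ws.actors), start_time, ws.nbytes)
    return (start_time, ws.nbytes)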

As I think about it more, I feel more skeptical of occupancy as a metric to use in the worker_objective. The greedy goal of "get this task running as soon as possible" is sometimes counterproductive to running the whole graph smoothly, and therefore quickly.

Especially as we're thinking about STA, it feels like maybe we should be more simple, dumb, and optimistic in initial task placement. Just follow graph structure. Then the load-balancing decisions to minimize occupancy, like worker_objective is trying to do eagerly right now, become more the responsibility of stealing. We also may be able to make these decisions better once things are already running and we've collected more information (runtime, memory, etc.).

None of this is really an answer, these are just all the things I've thought about trying to work on this.

[^1]: Well, some workers will have 3 of the a*, b*, c* roots and others will only have 2. So once the ones with only 2 finish all their ...s, they could maybe take some load from the ones with 3—but that's a work-stealing question.

Member replied:

Thank you for your thorough problem description. I will try to address some of the points above but it will not be an exhaustive review.

> So even though local topology is identical, we should schedule these differently depending on cluster size. If we had 100 workers, we'd schedule both graphs the same way (ignore the root task locations, because copying to new workers will give us more parallelism). And if we had 2 workers we'd also schedule both graphs the same way (consider root task locations because every worker already has a root task).

I agree. I want to point out, though, that parallelism is mostly not one of our concerns. In my mind, memory-efficient scheduling will trump parallelism if we ever need to make that call. However, I think the case where we are not parallelizing enough happens less frequently.

In particular, there is also the case where transfer costs would kill us and just executing things sequentially on a few workers would be faster. This case can only be handled by including runtime metrics, of course.

> But I actually think all of these cases should be handled by the root task logic. I wrote more notes in e5175ce, but I think we should think of that as "fan-out logic" instead of "root-ish task logic"

I would love it if we had a relatively high-level description documented somewhere of our strategic goals for scheduling. That would also include a description of what we mean when talking about root-ish task scheduling, since I'm still struggling to classify those tasks. I imagine something similar to what we do with dask.order; see https://github.com/dask/dask/blob/9fc5777f3d83f1084360adf982da301ed4afe13b/dask/order.py#L1-L77

I believe https://distributed.dask.org/en/latest/journey.html#step-3-select-a-worker goes in this direction, but it is not thorough enough and likely out of date.

> I'll note that I think amortizing transfer cost (#5326) is a more sensible way to do this.

I agree that some kind of amortization or weighting might be a good approach to solving this. I'm still not convinced where to apply weights and what the correct weights are, though.

I also see another component factoring in: if we knew which dependencies are intended to be fetched on a worker, that could influence our decision as well. Maybe, instead of tracking combined occupancy, we should track "comm occupancy" and try to minimize this instead of total occupancy? Can we detect a "too small parallelism" scenario and switch metrics based on that?
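
As a purely illustrative sketch (none of these names exist on the scheduler today), per-worker "comm occupancy" could be as simple as tracking pending incoming bytes and preferring the least-loaded candidate:

from dataclasses import dataclass, field


@dataclass
class CommOccupancy:
    # Bytes of pending incoming transfers, keyed by worker address.
    pending_bytes: dict = field(default_factory=dict)

    def add_transfer(self, addr: str, nbytes: int) -> None:
        self.pending_bytes[addr] = self.pending_bytes.get(addr, 0) + nbytes

    def finish_transfer(self, addr: str, nbytes: int) -> None:
        self.pending_bytes[addr] = max(0, self.pending_bytes.get(addr, 0) - nbytes)

    def best_worker(self, candidates):
        # Prefer the candidate with the least pending incoming data.
        return min(candidates, key=lambda addr: self.pending_bytes.get(addr, 0))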

To make things even messier, I also want to point out that I believe we're double-counting comm cost, which might blow up occupancy unnecessarily; see:

for tts in s:
    if tts._processing_on is not None:
        wws = tts._processing_on
        comm: double = self.get_comm_cost(tts, wws)
        old: double = wws._processing[tts]
        new: double = avg_duration + comm
        diff: double = new - old
        wws._processing[tts] = new
        wws._occupancy += diff
        self._total_occupancy += diff
I might be wrong, though.

> Basically to account for the opportunity cost of taking a spot on a worker that lots of other tasks could possibly run on, that […]

I don't dislike opportunity costs, but at this point it is much too unclear what that cost factor would even look like. I would suggest delaying this for now, since I don't think it will be strong enough to counteract occupancy and it will make things even less comprehensible. If we had a good handle on what the opportunity cost would actually be, that might be a different story.

> As I think about it more, I feel more skeptical of occupancy as a metric to use in the worker_objective. The greedy goal of "get this task running as soon as possible" is sometimes counterproductive to running the whole graph smoothly, and therefore quickly.

I'm open to reconsidering this metric for initial task placement, although this feels like a long-term thing since it may be a highly disruptive change. FWIW we will likely need to reconsider the entire worker_objective for STA since it is based 100% on runtime information.

You mentioned that it would work well enough if we left out stack_time from the worker_objective. This sounds quite similar to what is proposed in #5253, since disabling it entirely without a replacement would probably introduce many other problems.

Collaborator (author) replied:

> I want to point out, though, that parallelism is mostly not one of our concerns.

Totally agree. But taken to the extreme (never consider parallelism), we'd do things like assign every single task to the same worker in a 100-worker cluster, leaving 99 others idle, because that worker holds one root dependency. I know you're not proposing that at all—just saying that since we do sometimes consider parallelism, we have to have some mechanism to decide where that "sometimes" line is. Agreed that having documentation of this would help.

As you said, it's hard to do without runtime metrics. I think what I'm proposing is that, when we only have graph metrics, a reasonable threshold for adding parallelism is when we have more tasks to schedule than total threads, yet purely following dependencies would leave some workers totally idle.
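
As a sketch, that threshold could look something like the following (a hypothetical helper, not existing scheduler code):

def should_add_parallelism(n_ready_tasks, total_nthreads, workers_with_deps, all_workers):
    # Only spread work beyond where the dependencies live when (a) there is
    # more ready work than threads in the cluster, and (b) following
    # dependencies alone would leave some workers idle.
    return n_ready_tasks > total_nthreads and len(workers_with_deps) < len(all_workers)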

> Maybe, instead of tracking combined occupancy, we should track "comm occupancy" and try to minimize this instead of total occupancy?

I've often wanted this metric. (Adds a bit of communication for the workers to tell the scheduler every time they fetch a key from a peer though, right?) And I think it's a good thing to minimize, though what we really want to minimize is memory. However, tracking this data would allow us to more preemptively avoid transferring to a worker which already has a lot of (bytes of) pending incoming transfers.

> Can we detect a "too small parallelism" scenario and switch metrics based on that?

Probably not? Because tasks will prefer to be scheduled on the worker with no transfers, that worker's comm occupancy won't ever go up, so we'll keep scheduling there. The graph-structural approach would hopefully still avoid this, though.

> You mentioned that it would work well enough if we left out stack_time from the worker_objective. This sounds quite similar to what is proposed in #5253, since disabling it entirely without a replacement would probably introduce many other problems.

With #5253 we do still consider occupancy, just more accurately. But as I showed in #5253 (comment) that causes a different sort of bad behavior. That's a good example of why I think earliest start time isn't quite the right metric. With #5253 we correctly identify that we can actually start the task sooner by transferring to a different worker and cutting in the priority line, because the transfer is so cheap. We do successfully minimize task start time. The problem is that it disrupts most of the subsequent tasks by shoving in line like that. The short-term gains come with much bigger long-term costs.

> FWIW we will likely need to reconsider the entire worker_objective for STA since it is based 100% on runtime information.

Agreed. STA still feels like the place to start. I'm still intrigued by the idea of eagerly and optimistically assigning tasks by graph structure (since that's probably reasonable in the common case) and then load-balancing with runtime metrics.

Collaborator (author) added:

@fjetter I tried slightly rewording this metric in d0f0955, and it at least vaguely makes sense, though I'm still not sure we should be doing this PR at all vs. just trying STA.

+            < len(valid_workers if valid_workers is not None else all_workers)
Member commented:

Maybe add a / 2 here? "almost everywhere" might be enough of a condition here.

+            for wws in dts._who_has
+        }
     if valid_workers is None:
         if not candidates:
             candidates = set(all_workers)
150 changes: 150 additions & 0 deletions distributed/tests/test_scheduler.py
@@ -138,6 +138,9 @@ async def test_decide_worker_with_restrictions(client, s, a, b, c):
     ],
 )
 def test_decide_worker_coschedule_order_neighbors(ndeps, nthreads):
+    if ndeps >= len(nthreads):
+        pytest.skip()
+
     @gen_cluster(
         client=True,
         nthreads=nthreads,
@@ -237,6 +240,153 @@ def random(**kwargs):
    test()


@gen_cluster(client=True, nthreads=[("127.0.0.1", 1)] * 4)
async def test_decide_worker_common_dep_ignored(client, s, *workers):
    r"""
    When we have basic linear chains, but all the downstream tasks also share a common dependency, ignore that dependency.

    i j k l m n o p
    \__\__\__\___/__/__/__/
    | | | | | | | | |
    | | | | X | | | |
    a b c d e f g h

    ^ Ignore the location of X when picking a worker for i..p.
    It will end up being copied to all workers anyway.

    If a dependency will end up on every worker regardless, because many things depend on it,
    we should ignore it when selecting our candidate workers. Otherwise, we'll end up considering
    every worker as a candidate, which is 1) slow and 2) often leads to poor choices.
    """
    roots = [
        delayed(slowinc)(1, 0.1 / (i + 1), dask_key_name=f"root-{i}") for i in range(16)
Member commented:

Any reason for choosing delayed over futures?

Collaborator (author) replied:

Not really. It made it easy to visualize the graph though. Would we prefer futures?

Member replied:

Makes sense. Typically I would prefer futures since, if something goes wrong and we need to debug, futures contain fewer layers. However, the visualization argument is strong. I would recommend leaving an in-code comment to avoid overly eager refactoring down the line.

    ]
    # This shared dependency will get copied to all workers, eventually making all workers valid candidates for each dep
    everywhere = delayed(None, name="everywhere")
    deps = [
        delayed(lambda x, y: None)(r, everywhere, dask_key_name=f"dep-{i}")
        for i, r in enumerate(roots)
    ]

    rs, ds = dask.persist(roots, deps)
    await wait(ds)

    keys = {
        worker.name: dict(
            root_keys=sorted(
                [int(k.split("-")[1]) for k in worker.data if k.startswith("root")]
            ),
            deps_of_root=sorted(
                [int(k.split("-")[1]) for k in worker.data if k.startswith("dep")]
            ),
        )
        for worker in workers
    }

    for k in keys.values():
        assert k["root_keys"] == k["deps_of_root"]

    for worker in workers:
        log = worker.incoming_transfer_log
        if log:
            assert len(log) == 1
            assert list(log[0]["keys"]) == ["everywhere"]


@gen_cluster(client=True, nthreads=[("127.0.0.1", 1)] * 4)
async def test_decide_worker_large_subtrees_colocated(client, s, *workers):
r"""
Ensure that the above "ignore common dependencies" logic doesn't affect wide (but isolated) subtrees.

........ ........ ........ ........
\\\\//// \\\\//// \\\\//// \\\\////
a b c d
Member commented (on the ASCII art above):

I haven't put much thought into this question, so a short answer is more than enough.

Would your logic be impacted if the subtrees flow together again, i.e. if they have a common dependent or set of dependents, as in a tree reduction?

If the answer is "no, this logic doesn't go that deep into the graph" (which is what I'm currently guessing), that's fine.

Collaborator (author) replied:

No, it doesn't go any further into the graph.


    Each one of a, b, etc. has more dependents than there are workers. But just because a has
    lots of dependents doesn't necessarily mean it will end up copied to every worker.
    Because a also has a few siblings, a's dependents shouldn't spread out over the whole cluster.
    """
    roots = [delayed(inc)(i, dask_key_name=f"root-{i}") for i in range(len(workers))]
    deps = [
        delayed(inc)(r, dask_key_name=f"dep-{i}-{j}")
        for i, r in enumerate(roots)
        for j in range(len(workers) * 2)
    ]

    rs, ds = dask.persist(roots, deps)
    await wait(ds)

    keys = {
        worker.name: dict(
            root_keys=set(
                int(k.split("-")[1]) for k in worker.data if k.startswith("root")
            ),
            deps_of_root=set(
                int(k.split("-")[1]) for k in worker.data if k.startswith("dep")
            ),
        )
        for worker in workers
    }

    for k in keys.values():
        assert k["root_keys"] == k["deps_of_root"]
        assert len(k["root_keys"]) == len(roots) / len(workers)

    for worker in workers:
        assert not worker.incoming_transfer_log


@gen_cluster(
    client=True,
    nthreads=[("127.0.0.1", 1)] * 4,
    config={"distributed.scheduler.work-stealing": False},
)
async def test_decide_worker_large_multiroot_subtrees_colocated(client, s, *workers):
    r"""
    Same as the above test, but also check isolated trees with multiple roots.

    ........  ........  ........  ........
    \\\\////  \\\\////  \\\\////  \\\\////
      a b       c d       e f       g h
    """
    roots = [
        delayed(inc)(i, dask_key_name=f"root-{i}") for i in range(len(workers) * 2)
    ]
    deps = [
        delayed(lambda x, y: None)(
            r, roots[i * 2 + 1], dask_key_name=f"dep-{i * 2}-{j}"
        )
        for i, r in enumerate(roots[::2])
        for j in range(len(workers) * 2)
    ]

    rs, ds = dask.persist(roots, deps)
    await wait(ds)

    keys = {
        worker.name: dict(
            root_keys=set(
                int(k.split("-")[1]) for k in worker.data if k.startswith("root")
            ),
            deps_of_root=set().union(
                *(
                    (int(k.split("-")[1]), int(k.split("-")[1]) + 1)
                    for k in worker.data
                    if k.startswith("dep")
                )
            ),
        )
        for worker in workers
    }

    for k in keys.values():
        assert k["root_keys"] == k["deps_of_root"]
        assert len(k["root_keys"]) == len(roots) / len(workers)

    for worker in workers:
        assert not worker.incoming_transfer_log


@gen_cluster(client=True, nthreads=[("127.0.0.1", 1)] * 3)
async def test_move_data_over_break_restrictions(client, s, a, b, c):
    [x] = await client.scatter([1], workers=b.address)