Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cluster dump utilities #5920

Merged
merged 29 commits into from
Mar 23, 2022
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
07539ec
Initial stab
sjperkins Mar 9, 2022
5c727a8
Better error message
sjperkins Mar 9, 2022
1d957c8
Task states should be in memory
sjperkins Mar 9, 2022
e2375f3
Clean up test case
sjperkins Mar 9, 2022
6063d68
WIP
sjperkins Mar 10, 2022
62c8810
Finalise DumpInspector class
sjperkins Mar 10, 2022
dea9d7e
In case of unresponsive workers, the worker dump might be a string describing a comm failure
sjperkins Mar 11, 2022
75a4f6a
Provide story docstrings
sjperkins Mar 11, 2022
3c8821b
Merge branch 'main' into cluster_dump_utilities
sjperkins Mar 14, 2022
dee99de
Document load_cluster_dump
sjperkins Mar 14, 2022
4c69599
Annotate story return types
sjperkins Mar 14, 2022
cbd5609
Test with {submit,map} instead of dask.array
sjperkins Mar 14, 2022
182b3ea
Fix typo
sjperkins Mar 14, 2022
b233000
Test output of tasks_in_state
sjperkins Mar 14, 2022
0381454
Use defaultdict to group tasks with a duplicate key
sjperkins Mar 14, 2022
17e4136
Merge branch 'main' into cluster_dump_utilities
sjperkins Mar 15, 2022
15a0719
Split scheduler and worker task/story retrieval into separate functions
sjperkins Mar 15, 2022
51bb587
WIP Cluster dump tests
sjperkins Mar 15, 2022
7777724
Finalise task and story tests
sjperkins Mar 15, 2022
d428690
Rename DumpInspector to DumpArtefact
sjperkins Mar 15, 2022
426bef0
Remove unused earlier logic
sjperkins Mar 15, 2022
d8264ae
Add support for splitting the dump artefact into a tree of yaml files
sjperkins Mar 15, 2022
3a989bc
Support compaction of trivial state keys into a general.yaml file
sjperkins Mar 15, 2022
799325e
Address review comments
sjperkins Mar 16, 2022
1cb1d98
Remove Path conversion
sjperkins Mar 16, 2022
e9a7c3d
Convert story output from lists to dicts
sjperkins Mar 16, 2022
1ecb088
Some other test case improvements
sjperkins Mar 16, 2022
4b65b81
to_yaml -> to_yamls
sjperkins Mar 17, 2022
9f81cfc
Merge branch 'main' into cluster_dump_utilities
sjperkins Mar 23, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 110 additions & 0 deletions distributed/cluster_dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import msgpack

from distributed.compatibility import to_thread
from distributed.stories import scheduler_story, worker_story


def _tuple_to_list(node):
Expand Down Expand Up @@ -57,3 +58,112 @@ def writer(state: dict, f: IO):
# Write from a thread so we don't block the event loop quite as badly
# (the writer will still hold the GIL a lot though).
await to_thread(writer, state, f)


def load_cluster_dump(url: str):
    """Load a cluster dump artefact previously written to disk.

    Parameters
    ----------
    url : str
        Path or URL of the dump artefact. Must end in ``.msgpack.gz``
        (gzipped msgpack) or ``.yaml`` — the suffix selects the reader.

    Returns
    -------
    state : dict
        The cluster state stored in the dump.

    Raises
    ------
    ValueError
        If ``url`` does not end in ``.msgpack.gz`` or ``.yaml``.
    """
    if url.endswith(".msgpack.gz"):
        mode = "rb"
        reader = msgpack.unpack
    elif url.endswith(".yaml"):
        # yaml is an optional dependency; import lazily so msgpack dumps
        # can be loaded without it installed.
        import yaml

        mode = "r"
        reader = yaml.safe_load
    else:
        raise ValueError(f"url ({url}) must have a .msgpack.gz or .yaml suffix")

    # compression="infer" lets fsspec transparently gunzip the msgpack
    # artefact based on the file extension.
    with fsspec.open(url, mode, compression="infer") as f:
        return reader(f)


class DumpInspector:
"""
Utility class for inspecting the state of a cluster dump

.. code-block:: python

inspector = DumpInspect("dump.msgpack.gz")
memory_tasks = inspector.tasks_in_state("memory")
released_tasks = inspector.tasks_in_state("released)
sjperkins marked this conversation as resolved.
Show resolved Hide resolved
"""

def __init__(self, url_or_state: str | dict):
if isinstance(url_or_state, str):
self.dump = load_cluster_dump(url_or_state)
sjperkins marked this conversation as resolved.
Show resolved Hide resolved
elif isinstance(url_or_state, dict):
self.dump = url_or_state
else:
raise TypeError("'url_or_state' must be a str or dict")

def tasks_in_state(self, state: str = "", workers: bool = False) -> dict:
"""
Returns
-------
tasks : dict
A dictionary of scheduler tasks with state `state`.
worker tasks are included if `workers=True`
"""
stasks = self.dump["scheduler"]["tasks"]

if state:
tasks = {k: v for k, v in stasks.items() if v["state"] == state}
else:
tasks = stasks.copy()

if not workers:
return tasks

for worker_dump in self.dump["workers"].values():
if self._valid_worker_dump(worker_dump):
if state:
tasks.update(
(k, v)
for k, v in worker_dump["tasks"].items()
if v["state"] == state
)
else:
tasks.update(worker_dump["tasks"])
sjperkins marked this conversation as resolved.
Show resolved Hide resolved

return tasks

def _valid_worker_dump(self, worker_dump):
sjperkins marked this conversation as resolved.
Show resolved Hide resolved
# Worker dumps should be a dictionaries but can also be
# strings describing comm Failures
return isinstance(worker_dump, dict)

def story(self, *key_or_stimulus_id: str, workers: bool = False) -> list:
"""
Returns
-------
stories : list
A list of stories for the keys/stimulus ID's in `*key_or_stimulus_id`.
worker stories are included if `workers=True`
"""
keys = set(key_or_stimulus_id)
story = scheduler_story(keys, self.dump["scheduler"]["transition_log"])

if not workers:
return story

for wdump in self.dump["workers"].values():
if self._valid_worker_dump(wdump):
story.extend(worker_story(keys, wdump["log"]))

return story

def missing_workers(self) -> list:
"""
Returns
-------
missing : list
A list of workers connected to the scheduler, but which
did not respond to requests for a state dump.
"""
scheduler_workers = self.dump["scheduler"]["workers"]
responsive_workers = self.dump["workers"]
return [
w
for w in scheduler_workers.keys()
sjperkins marked this conversation as resolved.
Show resolved Hide resolved
if w not in responsive_workers
or not self._valid_worker_dump(responsive_workers[w])
]
5 changes: 2 additions & 3 deletions distributed/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@
from .security import Security
from .semaphore import SemaphoreExtension
from .stealing import WorkStealing
from .stories import scheduler_story
from .utils import (
All,
TimeoutError,
Expand Down Expand Up @@ -7533,9 +7534,7 @@ def transitions(self, recommendations: dict):
def story(self, *keys):
    """Get all transitions that touch one of the input keys

    Accepts task keys or TaskState objects; TaskStates are reduced to
    their keys before the transition log is searched. Delegates to the
    shared :func:`scheduler_story` helper.
    """
    keys = {key.key if isinstance(key, TaskState) else key for key in keys}
    return scheduler_story(keys, self.transition_log)

transition_story = story

Expand Down
44 changes: 44 additions & 0 deletions distributed/stories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from typing import Iterable


def scheduler_story(keys: set, transition_log: Iterable) -> list:
    """Creates a story from the scheduler transition log given a set of keys
    describing tasks or stimuli.

    Parameters
    ----------
    keys : set
        A set of task `keys` or `stimulus_id`'s
    transition_log : iterable
        The scheduler transition log

    Returns
    -------
    story : list
        Transitions whose key (``t[0]``) is in *keys*, or whose
        recommendations (``t[3]``) mention one of *keys*.
    """
    return [t for t in transition_log if t[0] in keys or keys.intersection(t[3])]


def worker_story(keys: set, log: Iterable):
"""Creates a story from the worker log given a set of keys
describing tasks or stimuli.

Parameters
----------
keys : set
A set of task `keys` or `stimulus_id`'s
log : iterable
The worker log

Returns
-------
story : list
"""
return [
msg
for msg in log
if any(key in msg for key in keys)
or any(
key in c for key in keys for c in msg if isinstance(c, (tuple, list, set))
)
]
46 changes: 30 additions & 16 deletions distributed/tests/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from tlz import concat, first, identity, isdistinct, merge, pluck, valmap

import dask
import dask.array as da
import dask.bag as db
from dask import delayed
from dask.optimization import SubgraphCallable
Expand Down Expand Up @@ -63,6 +64,7 @@
tokenize,
wait,
)
from distributed.cluster_dump import DumpInspector, load_cluster_dump
from distributed.comm import CommClosedError
from distributed.compatibility import LINUX, WINDOWS
from distributed.core import Status
Expand Down Expand Up @@ -7261,22 +7263,9 @@ def test_print_simple(capsys):


def _verify_cluster_dump(url, format: str, addresses: set[str]) -> dict:
fsspec = pytest.importorskip("fsspec")

url = str(url)
if format == "msgpack":
import msgpack

url += ".msgpack.gz"
loader = msgpack.unpack
else:
import yaml

url += ".yaml"
loader = yaml.safe_load

with fsspec.open(url, mode="rb", compression="infer") as f:
state = loader(f)
fsspec = pytest.importorskip("fsspec") # for load_cluster_dump
url = str(url) + (".msgpack.gz" if format == "msgpack" else ".yaml")
state = load_cluster_dump(url)

assert isinstance(state, dict)
assert "scheduler" in state
Expand Down Expand Up @@ -7345,6 +7334,31 @@ async def test_dump_cluster_state_json(c, s, a, b, tmp_path, local):
await c.dump_cluster_state(filename, format="json")


@pytest.mark.parametrize("local", [True, False])
@pytest.mark.parametrize("_format", ["msgpack", "yaml"])
@pytest.mark.parametrize("workers", [True, False])
@gen_cluster(client=True)
async def test_inspect_cluster_dump(c, s, a, b, tmp_path, _format, local, workers):
    """Dump cluster state, then inspect it with DumpInspector."""
    filename = tmp_path / "foo"
    if not local:
        pytest.importorskip("fsspec")
        # Make it look like an fsspec path
        filename = f"file://{filename}"

    A = da.ones(100, chunks=25)
    await c.persist(A)
    await c.dump_cluster_state(filename, format=_format)

    # msgpack dumps are gzip-compressed on disk; yaml dumps are not.
    suffix = ".gz" if _format == "msgpack" else ""
    inspector = DumpInspector(f"{filename}.{_format}{suffix}")
    tasks = inspector.tasks_in_state("memory", workers=workers)
    assert set(tasks.keys()) == set(map(str, A.__dask_keys__()))
    it = iter(tasks.keys())
    # Smoke-test story retrieval with a single key and with multiple keys,
    # and the missing-worker report.
    stories = inspector.story(next(it), workers=workers)
    stories = inspector.story(next(it), next(it), workers=workers)
    missing = inspector.missing_workers()


@gen_cluster(client=True)
async def test_dump_cluster_state_exclude_default(c, s, a, b, tmp_path):
futs = c.map(inc, range(10))
Expand Down
15 changes: 3 additions & 12 deletions distributed/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
from .security import Security
from .shuffle import ShuffleWorkerExtension
from .sizeof import safe_sizeof as sizeof
from .stories import worker_story
from .threadpoolexecutor import ThreadPoolExecutor
from .threadpoolexecutor import secede as tpe_secede
from .utils import (
Expand Down Expand Up @@ -2894,18 +2895,8 @@ def stateof(self, key: str) -> dict[str, Any]:
}

def story(self, *keys_or_tasks: str | TaskState) -> list[tuple]:
    """Return worker log entries that mention any of the given keys.

    Accepts task keys or TaskState objects; TaskStates are reduced to
    their keys before the log is searched. Delegates to the shared
    :func:`worker_story` helper.
    """
    keys = {e.key if isinstance(e, TaskState) else e for e in keys_or_tasks}
    return worker_story(keys, self.log)

def ensure_communicating(self) -> None:
stimulus_id = f"ensure-communicating-{time()}"
Expand Down