Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Support multi-run with hydra + DDP #18175

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/lightning/fabric/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Added

- Added support for hydra multi-run with DDP ([#18175](https://github.com/Lightning-AI/lightning/pull/18175))


- Added support for the TPU-v4 architecture ([#17227](https://github.com/Lightning-AI/lightning/pull/17227))


Expand Down
18 changes: 14 additions & 4 deletions src/lightning/fabric/strategies/launchers/subprocess_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import os
import subprocess
import sys
from pathlib import Path
from typing import Any, Callable, Optional, Sequence, Tuple

from lightning_utilities.core.imports import RequirementCache
Expand Down Expand Up @@ -144,6 +145,7 @@ def _basic_subprocess_cmd() -> Sequence[str]:
def _hydra_subprocess_cmd(local_rank: int) -> Tuple[Sequence[str], str]:
    """Build the command and working directory used to re-launch the current script as a DDP worker under Hydra.

    Args:
        local_rank: Rank of the worker subprocess being launched; used only to give
            the Hydra job a distinguishable name (``train_ddp_process_{local_rank}``).

    Returns:
        A tuple ``(command, cwd)`` where ``command`` is the argv list for the worker
        subprocess and ``cwd`` is the original (pre-Hydra) working directory.

    Raises:
        RuntimeError: If Hydra is in multirun mode but ``output_subdir`` is disabled,
            because the saved ``config.yaml`` is required to re-launch each sweep job.
    """
    import __main__  # local import to avoid https://github.com/Lightning-AI/lightning/issues/15218
    from hydra.core.hydra_config import HydraConfig
    from hydra.types import RunMode
    from hydra.utils import get_original_cwd, to_absolute_path

    # when user is using hydra find the absolute path
    if __main__.__spec__ is None:
        command = [sys.executable, to_absolute_path(sys.argv[0])]
    else:
        command = [sys.executable, "-m", __main__.__spec__.name]

    cwd = get_original_cwd()
    hydra_cfg = HydraConfig.get()
    rundir = Path(hydra_cfg.runtime.output_dir)

    # For multi-run, hydra_cfg.mode is RunMode.MULTIRUN, else it's RunMode.RUN
    if hydra_cfg.mode == RunMode.RUN:  # just run the current command again
        command += sys.argv[1:]
    elif hydra_cfg.output_subdir is None:
        raise RuntimeError("DDP with multirun requires saved config file")
    else:  # use the saved config for the new run
        # NOTE(review): assumes the primary config was saved as ``config.yaml`` —
        # confirm this holds when a custom config name is used.
        hydra_subdir = rundir / hydra_cfg.output_subdir
        command += ["-cp", str(hydra_subdir), "-cn", "config.yaml"]

    # Set output_subdir to null since we don't want the worker subprocesses trying to write config.yaml
    command += [f"hydra.run.dir={rundir}", f"hydra.job.name=train_ddp_process_{local_rank}", "hydra.output_subdir=null"]
    return command, cwd
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,38 @@ def test_ddp_with_hydra_runjob(subdir, tmp_path, monkeypatch):
assert len(logs) == devices


@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
@pytest.mark.skipif(not _HYDRA_WITH_RUN_PROCESS, reason=str(_HYDRA_WITH_RUN_PROCESS))
@pytest.mark.parametrize("num_jobs", [1, 2])
def test_ddp_with_hydra_multirunjob(num_jobs, tmp_path, monkeypatch):
    """Run a Hydra multirun sweep with the DDP strategy and verify that each sweep
    job saves its config and that Hydra logs one file per spawned process."""
    monkeypatch.chdir(tmp_path)

    # Write the training script into the (current) temp directory so the CLI can run it.
    (tmp_path / "temp.py").write_text(script)

    # Launch the sweep via the command line.
    devices = 2
    sweep_dir = tmp_path / "hydra_output"
    sweep_values = ",".join(str(job) for job in range(num_jobs))  # fake multirun params
    run_process(
        [
            sys.executable,
            "temp.py",
            f"+devices={devices}",
            '+strategy="ddp"',
            f"hydra.sweep.dir={sweep_dir}",
            f"+foo={sweep_values}",
            "--multirun",
        ]
    )

    # Each multirun job must have saved exactly one config.yaml.
    saved_confs = list(sweep_dir.glob("**/config.yaml"))
    assert len(saved_confs) == num_jobs

    # The swept parameter must have been set and recorded per job.
    for conf_path in saved_confs:
        job_cfg = OmegaConf.load(conf_path)
        job_idx = int(conf_path.parent.parent.parts[-1])
        assert job_cfg.devices == devices
        assert job_cfg.foo == job_idx

    # Hydra should have logged one file for every DDP process PL spawned.
    logs = list(sweep_dir.glob("**/*.log"))
    assert len(logs) == devices * num_jobs


def test_kill():
launcher = _SubprocessScriptLauncher(Mock(), 1, 1)
proc0 = Mock(autospec=subprocess.Popen)
Expand Down