Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Support multi-run with hydra + DDP #18175

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/lightning/fabric/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Added

- Added support for hydra multi-run with DDP ([#18175](https://github.com/Lightning-AI/lightning/pull/18175))


- Added support for the TPU-v4 architecture ([#17227](https://github.com/Lightning-AI/lightning/pull/17227))


Expand Down
18 changes: 14 additions & 4 deletions src/lightning/fabric/strategies/launchers/subprocess_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import os
import subprocess
import sys
from pathlib import Path
from typing import Any, Callable, Optional, Sequence, Tuple

from lightning_utilities.core.imports import RequirementCache
Expand Down Expand Up @@ -144,6 +145,7 @@ def _basic_subprocess_cmd() -> Sequence[str]:
def _hydra_subprocess_cmd(local_rank: int) -> Tuple[Sequence[str], str]:
    """Build the command and working directory used to re-launch the current script as a DDP worker under Hydra.

    Args:
        local_rank: Rank of the worker subprocess being launched; used only to give
            the Hydra job a distinguishable name (``train_ddp_process_{local_rank}``).

    Returns:
        A tuple ``(command, cwd)`` where ``command`` is the argv list for the worker
        subprocess and ``cwd`` is the original (pre-Hydra) working directory.

    Raises:
        RuntimeError: If Hydra is in multirun mode but ``output_subdir`` is disabled,
            because the saved ``config.yaml`` is required to re-launch each sweep job.
    """
    import __main__  # local import to avoid https://github.com/Lightning-AI/lightning/issues/15218
    from hydra.core.hydra_config import HydraConfig
    from hydra.types import RunMode
    from hydra.utils import get_original_cwd, to_absolute_path

    # when user is using hydra find the absolute path
    if __main__.__spec__ is None:
        command = [sys.executable, to_absolute_path(sys.argv[0])]
    else:
        command = [sys.executable, "-m", __main__.__spec__.name]

    cwd = get_original_cwd()
    hydra_cfg = HydraConfig.get()
    rundir = Path(hydra_cfg.runtime.output_dir)

    # For multi-run, hydra_cfg.mode is RunMode.MULTIRUN, else it's RunMode.RUN
    if hydra_cfg.mode == RunMode.RUN:  # just run the current command again
        command += sys.argv[1:]
    elif hydra_cfg.output_subdir is None:
        raise RuntimeError("DDP with multirun requires saved config file")
    else:  # use the saved config for the new run
        # NOTE(review): assumes the primary config was saved as ``config.yaml`` —
        # confirm this holds when a custom config name is used.
        hydra_subdir = rundir / hydra_cfg.output_subdir
        command += ["-cp", str(hydra_subdir), "-cn", "config.yaml"]

    # Set output_subdir to null since we don't want the worker subprocesses trying to write config.yaml
    command += [f"hydra.run.dir={rundir}", f"hydra.job.name=train_ddp_process_{local_rank}", "hydra.output_subdir=null"]
    return command, cwd
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,38 @@ def test_ddp_with_hydra_runjob(subdir, tmp_path, monkeypatch):
assert len(logs) == devices


@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True)
@pytest.mark.skipif(not _HYDRA_WITH_RUN_PROCESS, reason=str(_HYDRA_WITH_RUN_PROCESS))
@pytest.mark.parametrize("num_jobs", [1, 2])
def test_ddp_with_hydra_multirunjob(num_jobs, tmp_path, monkeypatch):
    """Run a Hydra multirun sweep with the DDP strategy and verify that each sweep
    job saves its config and that Hydra logs one file per spawned process."""
    monkeypatch.chdir(tmp_path)

    # Write the training script into the (current) temp directory so the CLI can run it.
    (tmp_path / "temp.py").write_text(script)

    # Launch the sweep via the command line.
    devices = 2
    sweep_dir = tmp_path / "hydra_output"
    sweep_values = ",".join(str(job) for job in range(num_jobs))  # fake multirun params
    run_process(
        [
            sys.executable,
            "temp.py",
            f"+devices={devices}",
            '+strategy="ddp"',
            f"hydra.sweep.dir={sweep_dir}",
            f"+foo={sweep_values}",
            "--multirun",
        ]
    )

    # Each multirun job must have saved exactly one config.yaml.
    saved_confs = list(sweep_dir.glob("**/config.yaml"))
    assert len(saved_confs) == num_jobs

    # The swept parameter must have been set and recorded per job.
    for conf_path in saved_confs:
        job_cfg = OmegaConf.load(conf_path)
        job_idx = int(conf_path.parent.parent.parts[-1])
        assert job_cfg.devices == devices
        assert job_cfg.foo == job_idx

    # Hydra should have logged one file for every DDP process PL spawned.
    logs = list(sweep_dir.glob("**/*.log"))
    assert len(logs) == devices * num_jobs


def test_kill():
launcher = _SubprocessScriptLauncher(Mock(), 1, 1)
proc0 = Mock(autospec=subprocess.Popen)
Expand Down