Skip to content

Commit

Permalink
dummy test train job
Browse files (browse the repository at this point in the history)
  • Loading branch information
albertz committed Dec 1, 2023
1 parent 7b1a29d commit 3b904cc
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,19 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None):
num_epochs=500, # because of multi-GPU, 1 subepoch here is like 4 subepochs in single-GPU
)

train_exp(
"tmp-dummy-test",
config_24gb_v6,
config_updates={"startup_callback": _tmp_dummy_startup_callback_exit, "dummy_test": 42},
gpu_mem=24,
num_epochs=1,
time_rqmt=1,
)


def _tmp_dummy_startup_callback_exit(**_kwargs):
raise SystemExit("dummy test exit")


_sis_prefix: Optional[str] = None

Expand Down Expand Up @@ -384,6 +397,7 @@ def train_exp(
gpu_mem: Optional[int] = 24,
num_processes: Optional[int] = None,
fine_tune: Optional[Union[int, List[Tuple[int, Dict[str, Any]]]]] = None,
time_rqmt: Optional[int] = None,
) -> ModelWithCheckpoints:
"""
Train experiment
Expand All @@ -410,6 +424,7 @@ def train_exp(
gpu_mem=gpu_mem,
num_processes=num_processes,
distributed_launch_cmd="torchrun" if num_processes else "mpirun",
time_rqmt=time_rqmt,
)
recog_training_exp(prefix, task, model_with_checkpoint, recog_def=model_recog)

Expand Down
3 changes: 2 additions & 1 deletion users/zeyer/experiments/exp2023_04_25_rf/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,8 @@ def train(
cpu_rqmt=4 if (not num_processes or num_processes <= 4) else 3,
horovod_num_processes=num_processes, # legacy name but also applies for Torch
).items():
kwargs.setdefault(k, v)
if k not in kwargs or kwargs[k] is None:
kwargs[k] = v
returnn_train_job = ReturnnTrainingJob(returnn_train_config, **kwargs)
returnn_train_job.add_alias(prefix_name + "/train")
if gpu_mem:
Expand Down

0 comments on commit 3b904cc

Please sign in to comment.