Skip to content

Commit

Permalink
more
Browse files: browse the repository at this point in the history
  • Loading branch information
albertz committed Nov 26, 2023
1 parent e4c7354 commit bbe83ca
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ def train_exp(
train_def=from_scratch_training,
num_epochs=num_epochs,
gpu_mem=gpu_mem,
horovod_num_processes=num_processes, # legacy name but also applies for Torch
num_processes=num_processes,
distributed_launch_cmd="torchrun" if num_processes else "mpirun",
)
recog_training_exp(prefix, task, model_with_checkpoint, recog_def=model_recog)
Expand Down
2 changes: 1 addition & 1 deletion users/zeyer/experiments/exp2023_04_25_rf/rz.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,6 @@ def py():
"base-24gb-v4-mgpu16",
config_24gb_v4,
config_updates={"torch_distributed": {}},
gpu_mem=1,
gpu_mem=32,
num_processes=16,
)
8 changes: 7 additions & 1 deletion users/zeyer/experiments/exp2023_04_25_rf/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def train(
init_params: Optional[Checkpoint] = None,
extra_hash: Any = None,
gpu_mem: Optional[int] = None,
num_processes: Optional[int] = None,
**kwargs,
) -> ModelWithCheckpoints:
"""
Expand Down Expand Up @@ -138,7 +139,12 @@ def train(

kwargs = kwargs.copy()
for k, v in dict(
log_verbosity=5, num_epochs=150, time_rqmt=80, mem_rqmt=30 if gpu_mem and gpu_mem > 11 else 15, cpu_rqmt=4
log_verbosity=5,
num_epochs=150,
time_rqmt=10, # TODO change back to 80, only temporary
mem_rqmt=30 if gpu_mem and gpu_mem > 11 else 15,
cpu_rqmt=4 if (not num_processes or num_processes <= 4) else 3,
horovod_num_processes=num_processes, # legacy name but also applies for Torch
).items():
kwargs.setdefault(k, v)
returnn_train_job = ReturnnTrainingJob(returnn_train_config, **kwargs)
Expand Down

0 comments on commit bbe83ca

Please sign in to comment.