diff --git a/users/zeyer/experiments/exp2023_04_25_rf/conformer_import_moh_att_2023_06_30.py b/users/zeyer/experiments/exp2023_04_25_rf/conformer_import_moh_att_2023_06_30.py
index 9d9dbaf06..29a74651c 100644
--- a/users/zeyer/experiments/exp2023_04_25_rf/conformer_import_moh_att_2023_06_30.py
+++ b/users/zeyer/experiments/exp2023_04_25_rf/conformer_import_moh_att_2023_06_30.py
@@ -92,19 +92,6 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None):
 
     train_exp("from-scratch-train", config, gpu_mem=11)
 
-    train_exp(
-        "v6-11gb-f32-bs15k-accgrad4-mgpu2",
-        config_24gb_v6,
-        config_updates={
-            "batch_size": 15_000 * _batch_size_factor,
-            "accum_grad_multiple_step": 4,
-            "torch_distributed": {},  # multi-GPU
-        },
-        config_deletes=["torch_amp"],  # f32
-        gpu_mem=11,
-        num_processes=2,  # multi-GPU
-    )
-
     train_exp(  # dev-other 7.6
         "base-24gb-bs30k-f32",
         config_24gb,
@@ -303,6 +290,38 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None):
         },
     )
 
+    train_exp(
+        "v6-11gb-f32-bs15k-accgrad4-mgpu2",
+        config_24gb_v6,
+        config_updates={
+            "batch_size": 15_000 * _batch_size_factor,  # ~1305 steps/epoch
+            "accum_grad_multiple_step": 4,
+            "torch_distributed": {},  # multi-GPU
+        },
+        config_deletes=["torch_amp"],  # f32
+        gpu_mem=11,
+        num_processes=2,  # multi-GPU
+    )
+
+    train_exp(
+        "v6-11gb-f32-bs15k-accgrad4-mgpu4-lrlin1e_5_295k",
+        config_24gb_v6,
+        config_updates={
+            "batch_size": 15_000 * _batch_size_factor,
+            "accum_grad_multiple_step": 4,  # note: per single GPU
+            "torch_distributed": {},  # multi-GPU
+            "learning_rate": 1.0,
+            "dynamic_learning_rate": dyn_lr_piecewise_linear,
+            # total steps after 500 epochs: ~652k
+            "learning_rate_piecewise_steps": [295_000, 590_000, 652_000],
+            "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6],
+        },
+        config_deletes=["torch_amp"],  # f32
+        gpu_mem=11,
+        num_processes=4,  # multi-GPU
+        num_epochs=500,  # because of multi-GPU, 1 subepoch here is like 4 subepochs in single-GPU
+    )
+
 
 
 _sis_prefix: Optional[str] = None
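
Note (not part of the diff): the second new experiment drives the learning rate via dyn_lr_piecewise_linear with learning_rate_piecewise_steps/values. The sketch below is a minimal, illustrative reading of that config, assuming the schedule linearly interpolates the piecewise values over the given global-step boundaries and scales them by the base learning_rate; the actual dyn_lr_piecewise_linear in the repo may differ in details. The boundaries match the step arithmetic in the comments: ~1305 optimizer steps per subepoch times 500 subepochs is roughly 652k total steps, with the peak value reached at step 295k.

# Illustrative sketch only; names and scaling behavior here are assumptions,
# not the repo's dyn_lr_piecewise_linear implementation.
import numpy as np


def piecewise_linear_lr(
    global_train_step: int,
    *,
    learning_rate: float = 1.0,
    steps=(295_000, 590_000, 652_000),
    values=(1e-5, 1e-3, 1e-5, 1e-6),
) -> float:
    """Linearly interpolate the LR over the given step boundaries.

    `values` has one more entry than `steps`: the first value applies at step 0.
    The base `learning_rate` (1.0 in the config above) acts as a global scale.
    """
    xp = np.concatenate(([0.0], np.asarray(steps, dtype=float)))
    fp = np.asarray(values, dtype=float)
    return float(learning_rate * np.interp(global_train_step, xp, fp))


# Under this reading, the LR warms up from 1e-5 to a peak of 1e-3 at step 295k
# (~45% of the ~652k total steps), decays back to 1e-5 by 590k, then to 1e-6.
assert abs(piecewise_linear_lr(295_000) - 1e-3) < 1e-12
assert abs(piecewise_linear_lr(652_000) - 1e-6) < 1e-12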