Commit ba3a651

albertz committed Nov 30, 2023
1 parent bb83aef · commit ba3a651
Showing 1 changed file with 32 additions and 13 deletions.
@@ -92,19 +92,6 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None):

    train_exp("from-scratch-train", config, gpu_mem=11)

-    train_exp(
-        "v6-11gb-f32-bs15k-accgrad4-mgpu2",
-        config_24gb_v6,
-        config_updates={
-            "batch_size": 15_000 * _batch_size_factor,
-            "accum_grad_multiple_step": 4,
-            "torch_distributed": {},  # multi-GPU
-        },
-        config_deletes=["torch_amp"],  # f32
-        gpu_mem=11,
-        num_processes=2,  # multi-GPU
-    )
-
    train_exp(  # dev-other 7.6
        "base-24gb-bs30k-f32",
        config_24gb,
@@ -303,6 +290,38 @@ def sis_run_with_prefix(prefix_name: Optional[str] = None):
        },
    )

+    train_exp(
+        "v6-11gb-f32-bs15k-accgrad4-mgpu2",
+        config_24gb_v6,
+        config_updates={
+            "batch_size": 15_000 * _batch_size_factor,  # ~1305 steps/epoch
+            "accum_grad_multiple_step": 4,
+            "torch_distributed": {},  # multi-GPU
+        },
+        config_deletes=["torch_amp"],  # f32
+        gpu_mem=11,
+        num_processes=2,  # multi-GPU
+    )

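The re-added "v6-11gb-f32-bs15k-accgrad4-mgpu2" block is the same experiment the first hunk deletes, moved further down the file and unchanged apart from the "~1305 steps/epoch" comment. Its settings emulate a large effective batch on 11 GB GPUs via gradient accumulation plus 2-way data parallelism. A minimal sketch of that arithmetic follows; the value of _batch_size_factor and the gradient-sync assumption are illustrative, not taken from this diff:

```python
# Back-of-the-envelope sketch for "v6-11gb-f32-bs15k-accgrad4-mgpu2".
# Not code from this repo; _batch_size_factor's value is an assumption here
# (it converts the abstract batch size into the unit the trainer actually
# counts, e.g. audio frames).
_batch_size_factor = 160  # assumed for illustration only

batch_size = 15_000 * _batch_size_factor  # per-GPU limit per forward/backward
accum_grad_multiple_step = 4              # accumulate 4 steps per optimizer update
num_processes = 2                         # data-parallel workers

# Assuming the workers synchronize gradients (RETURNN's torch_distributed
# can also be configured differently, e.g. parameter averaging):
effective_batch = batch_size * accum_grad_multiple_step * num_processes
print(f"effective batch per update: {effective_batch:,}")  # 19,200,000
```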
+    train_exp(
+        "v6-11gb-f32-bs15k-accgrad4-mgpu4-lrlin1e_5_295k",
+        config_24gb_v6,
+        config_updates={
+            "batch_size": 15_000 * _batch_size_factor,
+            "accum_grad_multiple_step": 4,  # note: per single GPU
+            "torch_distributed": {},  # multi-GPU
+            "learning_rate": 1.0,
+            "dynamic_learning_rate": dyn_lr_piecewise_linear,
+            # total steps after 500 epochs: ~652k
+            "learning_rate_piecewise_steps": [295_000, 590_000, 652_000],
+            "learning_rate_piecewise_values": [1e-5, 1e-3, 1e-5, 1e-6],
+        },
+        config_deletes=["torch_amp"],  # f32
+        gpu_mem=11,
+        num_processes=4,  # multi-GPU
+        num_epochs=500,  # because of multi-GPU, 1 subepoch here is like 4 subepochs in single-GPU
+    )

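The second new experiment switches to an explicit piecewise-linear learning-rate schedule over the global train step: learning_rate is set to 1.0, so the piecewise values are the actual learning rates, warming up from 1e-5 to 1e-3 over the first 295k steps (the "lrlin1e_5_295k" in the name), decaying back to 1e-5 by 590k, then annealing to 1e-6 by 652k. The "~652k" total agrees with the "~1305 steps/epoch" noted above: 1305 × 500 ≈ 652.5k. Below is a minimal sketch of what such a schedule computes, assuming linear interpolation between breakpoints; it is not the actual dyn_lr_piecewise_linear implementation, which lives elsewhere in the repo:

```python
# Sketch of a piecewise-linear LR schedule: linear interpolation between
# (step, value) breakpoints, scaled by the base learning rate.
def piecewise_linear_lr(global_step: int, steps, values, base_lr: float = 1.0) -> float:
    assert len(values) == len(steps) + 1  # values[0] is the LR at step 0
    prev_step, prev_value = 0, values[0]
    for step, value in zip(steps, values[1:]):
        if global_step < step:
            frac = (global_step - prev_step) / (step - prev_step)
            return base_lr * (prev_value + frac * (value - prev_value))
        prev_step, prev_value = step, value
    return base_lr * values[-1]  # constant after the last breakpoint

steps = [295_000, 590_000, 652_000]
values = [1e-5, 1e-3, 1e-5, 1e-6]
for s in (0, 147_500, 295_000, 590_000, 652_000):
    print(s, piecewise_linear_lr(s, steps, values))
# 0 -> 1e-05, 147500 -> ~5.05e-04, 295000 -> 1e-03, 590000 -> 1e-05, 652000 -> 1e-06
```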

_sis_prefix: Optional[str] = None

