diff --git a/train_configs/llama3_405b.toml b/train_configs/llama3_405b.toml
index 9ed74e3c..e8e89b7e 100644
--- a/train_configs/llama3_405b.toml
+++ b/train_configs/llama3_405b.toml
@@ -1,5 +1,6 @@
 # torchtitan Config.toml
-# NOTE: this toml config is a preset for 64 A100 GPUs.
+# NOTE: this toml config is a preset for 128 H100 GPUs.
+# This is still WIP so please use with caution.
 
 [job]
 dump_folder = "./outputs"
@@ -23,14 +24,14 @@ tokenizer_path = "./torchtitan/datasets/tokenizer/original/tokenizer.model"
 
 [optimizer]
 name = "AdamW"
-lr = 1.5e-4
+lr = 0.8e-4 # WIP right now.
 
 [training]
-batch_size = 16
+batch_size = 1
 seq_len = 8192
-warmup_steps = 200 # lr scheduler warm up, normally 20% of the train steps
+warmup_steps = 600 # lr scheduler warm up, normally 20% of the train steps
 max_norm = 1.0 # grad norm clipping
-steps = 1000
+steps = 3000
 data_parallel_degree = -1
 tensor_parallel_degree = 8 # 8-way TP
 enable_float8_linear = false
@@ -50,4 +51,4 @@ export_dtype = "float32"
 async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"]
 
 [activation_checkpoint]
-mode = 'full'
+mode = 'full' # ['none', 'selective', 'full']