diff --git a/train_configs/llama3_405b.toml b/train_configs/llama3_405b.toml
index 9ed74e3c..e8e89b7e 100644
--- a/train_configs/llama3_405b.toml
+++ b/train_configs/llama3_405b.toml
@@ -1,5 +1,6 @@
 # torchtitan Config.toml
-# NOTE: this toml config is a preset for 64 A100 GPUs.
+# NOTE: this toml config is a preset for 128 H100 GPUs.
+# This is still WIP so please use with caution.
 
 [job]
 dump_folder = "./outputs"
@@ -23,14 +24,14 @@ tokenizer_path = "./torchtitan/datasets/tokenizer/original/tokenizer.model"
 
 [optimizer]
 name = "AdamW"
-lr = 1.5e-4
+lr = 0.8e-4 # WIP right now.
 
 [training]
-batch_size = 16
+batch_size = 1
 seq_len = 8192
-warmup_steps = 200 # lr scheduler warm up, normally 20% of the train steps
+warmup_steps = 600 # lr scheduler warm up, normally 20% of the train steps
 max_norm = 1.0 # grad norm clipping
-steps = 1000
+steps = 3000
 data_parallel_degree = -1
 tensor_parallel_degree = 8 # 8-way TP
 enable_float8_linear = false
@@ -50,4 +51,4 @@ export_dtype = "float32"
 async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"]
 
 [activation_checkpoint]
-mode = 'full'
+mode = 'full' # ['none', 'selective', 'full']