pass max_steps to scheduler in pretrain recipes
Signed-off-by: Maanu Grover <maanug@nvidia.com>
maanug-nv committed Sep 27, 2024
1 parent 5e66cad commit 06cca55
Showing 17 changed files with 32 additions and 16 deletions.
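
As a hedged illustration of what the recipe changes below enable (the module path and argument names follow the llama3_8b diff; the recipe name and step count here are made up, and this assumes a NeMo install where the decorated factory can be called directly):

    from nemo.collections.llm.recipes import llama3_8b

    # max_steps set on the recipe now reaches both the trainer and the
    # cosine-annealing LR scheduler, instead of the scheduler falling back
    # to its own default, so the decay horizon matches the training run.
    recipe = llama3_8b.pretrain_recipe(
        name="llama3_8b_short_run",  # illustrative name
        num_nodes=1,
        num_gpus_per_node=8,
        max_steps=100_000,  # overrides the 1168251 default shown in the diff
    )
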
5 changes: 3 additions & 2 deletions nemo/collections/llm/recipes/llama31_405b.py
@@ -131,7 +131,7 @@ def trainer(

@run.cli.factory(target=pretrain, name=NAME)
def pretrain_recipe(
- dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain
+ dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, max_steps: int = 1168251, fn=pretrain
) -> run.Partial:
"""
Create a pre-training recipe for Llama3.1 405B model.
@@ -167,10 +167,11 @@ def pretrain_recipe(
trainer=trainer(
num_nodes=num_nodes,
num_gpus_per_node=num_gpus_per_node,
+ max_steps=max_steps,
callbacks=[run.Config(TimingCallback)],
),
data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1),
log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
- optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4),
+ optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4, max_steps=max_steps),
resume=default_resume(),
)
5 changes: 3 additions & 2 deletions nemo/collections/llm/recipes/llama3_70b.py
@@ -141,7 +141,7 @@ def trainer(

@run.cli.factory(target=pretrain, name=NAME)
def pretrain_recipe(
- dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain
+ dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, max_steps: int = 1168251, fn=pretrain
) -> run.Partial:
"""
Create a pre-training recipe for Llama3 70B model.
@@ -177,11 +177,12 @@ def pretrain_recipe(
trainer=trainer(
num_nodes=num_nodes,
num_gpus_per_node=num_gpus_per_node,
+ max_steps=max_steps,
callbacks=[run.Config(TimingCallback)],
),
data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1),
log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
- optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4),
+ optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4, max_steps=max_steps),
resume=default_resume(),
)

5 changes: 3 additions & 2 deletions nemo/collections/llm/recipes/llama3_8b.py
@@ -141,7 +141,7 @@ def trainer(

@run.cli.factory(target=pretrain, name=NAME)
def pretrain_recipe(
- dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain
+ dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, max_steps: int = 1168251, fn=pretrain
) -> run.Partial:
"""
Create a pre-training recipe for Llama3 8B model.
@@ -178,11 +178,12 @@ def pretrain_recipe(
trainer=trainer(
num_nodes=num_nodes,
num_gpus_per_node=num_gpus_per_node,
+ max_steps=max_steps,
callbacks=[run.Config(TimingCallback)],
),
data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1),
log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
- optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4),
+ optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4, max_steps=max_steps),
resume=default_resume(),
)

5 changes: 3 additions & 2 deletions nemo/collections/llm/recipes/mistral.py
@@ -138,7 +138,7 @@ def trainer(

@run.cli.factory(target=pretrain, name=NAME)
def pretrain_recipe(
- dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, fn=pretrain
+ dir: Optional[str] = None, name: str = "default", num_nodes: int = 1, num_gpus_per_node: int = 8, max_steps: int = 100, fn=pretrain
) -> run.Partial:
"""
Create a pre-training recipe for Mistral 7B model.
@@ -177,11 +177,12 @@ def pretrain_recipe(
sequence_parallelism=False,
num_nodes=num_nodes,
num_gpus_per_node=num_gpus_per_node,
+ max_steps=max_steps,
callbacks=[run.Config(TimingCallback)],
),
data=run.Config(MockDataModule, seq_length=4096, global_batch_size=512, micro_batch_size=1),
log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
- optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4),
+ optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4, max_steps=max_steps),
resume=default_resume(),
)

6 changes: 3 additions & 3 deletions nemo/collections/llm/recipes/mixtral_8x22b.py
@@ -141,7 +141,7 @@ def trainer(

@run.cli.factory(target=pretrain, name=NAME)
def pretrain_recipe(
- dir: Optional[str] = None, name: str = "default", num_nodes: int = 16, num_gpus_per_node: int = 8, fn=pretrain
+ dir: Optional[str] = None, name: str = "default", num_nodes: int = 16, num_gpus_per_node: int = 8, max_steps: int = 1168251, fn=pretrain
) -> run.Partial:
"""
Create a pre-training recipe for Mixtral 8x22B model.
@@ -172,11 +172,11 @@ def pretrain_recipe(
fn,
model=model(),
trainer=trainer(
- num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, callbacks=[run.Config(TimingCallback)]
+ num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, max_steps=max_steps, callbacks=[run.Config(TimingCallback)]
),
data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1),
log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
- optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4),
+ optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4, max_steps=max_steps),
resume=default_resume(),
)

5 changes: 3 additions & 2 deletions nemo/collections/llm/recipes/mixtral_8x3b.py
@@ -141,7 +141,7 @@ def trainer(

@run.cli.factory(target=pretrain, name=NAME)
def pretrain_recipe(
- dir: Optional[str] = None, name: str = "default", num_nodes: int = 2, num_gpus_per_node: int = 8, fn=pretrain
+ dir: Optional[str] = None, name: str = "default", num_nodes: int = 2, num_gpus_per_node: int = 8, max_steps: int = 1168251, fn=pretrain
) -> run.Partial:
"""
Create a pre-training recipe for Mixtral 8x3B model.
@@ -174,11 +174,12 @@ def pretrain_recipe(
trainer=trainer(
num_nodes=num_nodes,
num_gpus_per_node=num_gpus_per_node,
+ max_steps=max_steps,
callbacks=[run.Config(TimingCallback)],
),
data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1),
log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
- optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4),
+ optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4, max_steps=max_steps),
resume=default_resume(),
)

6 changes: 3 additions & 3 deletions nemo/collections/llm/recipes/mixtral_8x7b.py
@@ -140,7 +140,7 @@ def trainer(

@run.cli.factory(target=pretrain, name=NAME)
def pretrain_recipe(
- dir: Optional[str] = None, name: str = "default", num_nodes: int = 8, num_gpus_per_node: int = 8, fn=pretrain
+ dir: Optional[str] = None, name: str = "default", num_nodes: int = 8, num_gpus_per_node: int = 8, max_steps: int = 1168251, fn=pretrain
) -> run.Partial:
"""
Create a pre-training recipe for Mixtral 8x7B model.
@@ -171,11 +171,11 @@ def pretrain_recipe(
fn,
model=model(),
trainer=trainer(
- num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, callbacks=[run.Config(TimingCallback)]
+ num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, max_steps=max_steps, callbacks=[run.Config(TimingCallback)]
),
data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1),
log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
- optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4),
+ optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4, max_steps=max_steps),
resume=default_resume(),
)

1 change: 1 addition & 0 deletions nemo/collections/llm/recipes/nemotron3_4b.py
@@ -160,6 +160,7 @@ def pretrain_recipe(
precision=precision,
warmup_steps=warmup_steps,
constant_steps=constant_steps,
+ max_steps=max_steps,
min_lr=min_lr,
max_lr=max_lr,
clip_grad=gradient_clip_val,
1 change: 1 addition & 0 deletions nemo/collections/llm/recipes/nemotron3_8b.py
@@ -166,6 +166,7 @@ def pretrain_recipe(
precision=precision,
warmup_steps=warmup_steps,
constant_steps=constant_steps,
+ max_steps=max_steps,
min_lr=min_lr,
max_lr=max_lr,
clip_grad=gradient_clip_val,
1 change: 1 addition & 0 deletions nemo/collections/llm/recipes/nemotron4_15b.py
@@ -163,6 +163,7 @@ def pretrain_recipe(
precision=precision,
warmup_steps=warmup_steps,
constant_steps=constant_steps,
+ max_steps=max_steps,
min_lr=min_lr,
max_lr=max_lr,
clip_grad=gradient_clip_val,
1 change: 1 addition & 0 deletions nemo/collections/llm/recipes/nemotron4_15b_16k.py
@@ -163,6 +163,7 @@ def pretrain_recipe(
precision=precision,
warmup_steps=warmup_steps,
constant_steps=constant_steps,
+ max_steps=max_steps,
min_lr=min_lr,
max_lr=max_lr,
clip_grad=gradient_clip_val,
1 change: 1 addition & 0 deletions nemo/collections/llm/recipes/nemotron4_15b_64k.py
@@ -163,6 +163,7 @@ def pretrain_recipe(
precision=precision,
warmup_steps=warmup_steps,
constant_steps=constant_steps,
+ max_steps=max_steps,
min_lr=min_lr,
max_lr=max_lr,
clip_grad=gradient_clip_val,
1 change: 1 addition & 0 deletions nemo/collections/llm/recipes/nemotron4_22b.py
@@ -163,6 +163,7 @@ def pretrain_recipe(
precision=precision,
warmup_steps=warmup_steps,
constant_steps=constant_steps,
+ max_steps=max_steps,
min_lr=min_lr,
max_lr=max_lr,
clip_grad=gradient_clip_val,
1 change: 1 addition & 0 deletions nemo/collections/llm/recipes/nemotron4_22b_16k.py
@@ -163,6 +163,7 @@ def pretrain_recipe(
precision=precision,
warmup_steps=warmup_steps,
constant_steps=constant_steps,
+ max_steps=max_steps,
min_lr=min_lr,
max_lr=max_lr,
clip_grad=gradient_clip_val,
1 change: 1 addition & 0 deletions nemo/collections/llm/recipes/nemotron4_22b_64k.py
@@ -163,6 +163,7 @@ def pretrain_recipe(
precision=precision,
warmup_steps=warmup_steps,
constant_steps=constant_steps,
+ max_steps=max_steps,
min_lr=min_lr,
max_lr=max_lr,
clip_grad=gradient_clip_val,
1 change: 1 addition & 0 deletions nemo/collections/llm/recipes/nemotron4_340b.py
@@ -166,6 +166,7 @@ def pretrain_recipe(
precision=precision,
warmup_steps=warmup_steps,
constant_steps=constant_steps,
+ max_steps=max_steps,
min_lr=min_lr,
max_lr=max_lr,
clip_grad=gradient_clip_val,
2 changes: 2 additions & 0 deletions nemo/collections/llm/recipes/optim/adam.py
@@ -25,6 +25,7 @@ def distributed_fused_adam_with_cosine_annealing(
precision: str = "bf16-mixed", # or "16-mixed"
warmup_steps: int = 2000,
constant_steps: int = 0,
+ max_steps: int = 10,
max_lr: float = 1e-4,
min_lr: Optional[float] = None,
clip_grad: float = 1.0,
@@ -47,6 +48,7 @@ def distributed_fused_adam_with_cosine_annealing(
min_lr = min_lr or (0.1 * max_lr)
sched = run.Config(
CosineAnnealingScheduler,
+ max_steps=max_steps,
warmup_steps=warmup_steps,
constant_steps=constant_steps,
min_lr=min_lr,

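The optim/adam.py change above is where the value lands: distributed_fused_adam_with_cosine_annealing now takes a max_steps argument (defaulting to 10) and forwards it to the CosineAnnealingScheduler config, and each recipe overrides it with the same step count it passes to the trainer. A minimal sketch, assuming NeMo is installed and using only names visible in this diff; the step count is illustrative:

    from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing

    # Build an optimizer config whose cosine decay spans the same number of
    # steps the trainer will run, rather than a scheduler-side default.
    optim = distributed_fused_adam_with_cosine_annealing(max_lr=3e-4, max_steps=500_000)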