Skip to content

Commit

Permalink
Merge branch 'main' of github.com:NVIDIA/NeMo into ashors/nemo-ux-load-on-device
Browse files Browse the repository at this point in the history
  • Loading branch information
ashors1 committed Jul 10, 2024
2 parents f0f1ca9 + b4821e1 commit cd8f28e
Show file tree
Hide file tree
Showing 48 changed files with 2,527 additions and 236 deletions.
7 changes: 7 additions & 0 deletions .github/workflows/_test_template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ on:
description: Last 2000 characters of the test step's log
value: ${{ jobs.main.outputs.log }}
jobs:
# Housekeeping job: prunes old Docker data on the runner chosen by inputs.RUNNER.
runner-auto-clean:
  runs-on: ${{ inputs.RUNNER }}
  steps:
    - name: Docker system cleanup
      # --all removes every unused image (not only dangling ones), the filter
      # restricts pruning to objects older than 48 hours, and --force skips
      # the interactive confirmation prompt.
      run: docker system prune --all --force --filter "until=48h"
main:
runs-on: ${{ inputs.RUNNER }}
outputs:
Expand Down
106 changes: 102 additions & 4 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2391,7 +2391,7 @@ jobs:
L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
runs-on: self-hosted-azure-gpus-2-h100
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
Expand All @@ -2403,6 +2403,21 @@ jobs:
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
env:
# This is to improve p2p overlap on H100
NVTE_FWD_LAYERNORM_SM_MARGIN: 8
NVTE_BWD_LAYERNORM_SM_MARGIN: 8
TORCH_NCCL_AVOID_RECORD_STREAMS: 1
NCCL_MIN_NCHANNELS: 4
# TP overlap is not supported in docker environment
#NVTE_UB_SPLIT_RS: 0
#NVTE_UB_ATOMIC_GEMM_RS: 1
#NVTE_RS_STRIDED_ATOMIC: 1
#NVTE_UB_FP8_RS: 1
# Increase p2p chunksize to 2MB
NCCL_P2P_NET_CHUNKSIZE: 2097152
# Disable gc when switching to/from validation steps
NEMO_MANUAL_GC_IN_VALIDATION: 0
steps:
- name: Checkout repository
uses: actions/checkout@v4
Expand All @@ -2417,8 +2432,17 @@ jobs:
trainer.max_steps=3 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
++model.transformer_engine=True \
++model.fp8=True \
++model.fp8_hybrid=True \
++model.fp8_amax_history_len=1024 \
++model.fp8_amax_compute_algo=max \
++model.reduce_amax=True \
++model.use_te_rng_tracker=True \
++model.name=megatron_gpt_full_te_layer_autocast \
model.ub_tp_comm_overlap=False \
model.tensor_model_parallel_size=2 \
model.optim.name=fused_adam \
model.optim.name=distributed_fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=1 \
model.optim.sched.constant_steps=1 \
Expand Down Expand Up @@ -2452,8 +2476,17 @@ jobs:
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
exp_manager.resume_if_exists=True \
++model.transformer_engine=True \
++model.fp8=True \
++model.fp8_hybrid=True \
++model.fp8_amax_history_len=1024 \
++model.fp8_amax_compute_algo=max \
++model.reduce_amax=True \
++model.use_te_rng_tracker=True \
++model.name=megatron_gpt_full_te_layer_autocast \
model.ub_tp_comm_overlap=False \
model.tensor_model_parallel_size=2 \
model.optim.name=fused_adam \
model.optim.name=distributed_fused_adam \
model.optim.lr=2e-4 \
model.optim.sched.warmup_steps=2 \
model.optim.sched.constant_steps=2 \
Expand Down Expand Up @@ -2945,10 +2978,11 @@ jobs:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure
RUNNER: self-hosted-azure-gpus-2-h100
SCRIPT: |
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
trainer.limit_val_batches=2 \
Expand All @@ -2957,6 +2991,15 @@ jobs:
trainer.precision=bf16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
++model.transformer_engine=True \
++model.fp8=True \
++model.fp8_hybrid=True \
++model.fp8_amax_history_len=1024 \
++model.fp8_amax_compute_algo=max \
++model.reduce_amax=True \
++model.use_te_rng_tracker=True \
++model.name=megatron_gpt_full_te_layer_autocast \
model.ub_tp_comm_overlap=False \
model.pipeline_model_parallel_size=2 \
model.tensor_model_parallel_size=1 \
model.mcore_gpt=True \
Expand All @@ -2981,12 +3024,15 @@ jobs:
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_granularity=full \
model.activations_checkpoint_num_layers=1 \
model.data.validation_drop_last=False \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
python examples/nlp/language_modeling/megatron_gpt_pretraining.py \
trainer.devices=2 \
trainer.accelerator=gpu \
trainer.log_every_n_steps=1 \
trainer.val_check_interval=2 \
trainer.limit_val_batches=2 \
Expand All @@ -2998,6 +3044,15 @@ jobs:
model.megatron_amp_O2=True \
exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
exp_manager.resume_if_exists=True \
++model.transformer_engine=True \
++model.fp8=True \
++model.fp8_hybrid=True \
++model.fp8_amax_history_len=1024 \
++model.fp8_amax_compute_algo=max \
++model.reduce_amax=True \
++model.use_te_rng_tracker=True \
++model.name=megatron_gpt_full_te_layer_autocast \
model.ub_tp_comm_overlap=False \
model.pipeline_model_parallel_size=2 \
model.tensor_model_parallel_size=1 \
model.optim.name=distributed_fused_adam \
Expand All @@ -3020,7 +3075,9 @@ jobs:
model.hidden_size=256 \
model.num_attention_heads=8 \
model.activations_checkpoint_method=block \
model.activations_checkpoint_granularity=full \
model.activations_checkpoint_num_layers=1 \
model.data.validation_drop_last=False \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \
model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings
AFTER_SCRIPT: |
Expand Down Expand Up @@ -3141,6 +3198,47 @@ jobs:
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"

# CI job: short reranker fine-tuning run (trainer.max_steps=20, trainer.devices=1)
# driven by examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py.
L2_Megatron_GPT_Reranker:
# Starts only after the cicd-test-container-setup job.
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
# The job runs inside the container image tagged with this workflow run's id.
# The --volume option below mounts the host's /mnt/datadrive/TestData at
# /home/TestData, which is where the script reads its data and checkpoint from.
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
# The script removes the experiment working_dir (located under the mounted
# TestData volume) before launching the fine-tuning script and again after it.
# Validation reuses train.jsonl and writes embeddings under working_dir/val_embs.
- run: |
rm -rf /home/TestData/nlp/megatron_ir/working_dir
python examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py \
exp_manager.exp_dir='/home/TestData/nlp/megatron_ir/working_dir' \
model.global_batch_size=4 \
model.micro_batch_size=4 \
trainer.devices=1 \
trainer.num_nodes=1 \
trainer.max_epochs=null \
trainer.max_steps=20 \
trainer.val_check_interval=10 \
model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \
model.peft.lora_tuning.adapter_dim=8 \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] \
model.data.validation_ds.write_embeddings_to_file=True \
model.data.validation_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/val_embs' \
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl]
rm -rf /home/TestData/nlp/megatron_ir/working_dir
# Runs only when an earlier step in this job has failed.
# NOTE(review): presumably cancels the rest of the workflow - confirm against the action's definition.
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"

L2_Megatron_GPT_Embedding:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
*.pkl
#*.ipynb
output
output_2048
result
*.pt
tests/data/asr
Expand Down Expand Up @@ -179,3 +180,4 @@ examples/neural_graphs/*.yml
.hydra/
nemo_experiments/

slurm*.out
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ trainer:
enable_model_summary: True
limit_val_batches: 0


exp_manager:
exp_dir: null
name: ${name}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,6 @@ model:
lossconfig:
target: torch.nn.Identity



conditioner_config:
_target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.GeneralConditioner
emb_models:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,6 @@ model:
target: torch.nn.Identity



conditioner_config:
_target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.GeneralConditioner
emb_models:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ infer:
sampling:
base:
sampler: EulerEDMSampler
width: 256
height: 256
steps: 40
width: 512
height: 512
steps: 50
discretization: "LegacyDDPMDiscretization"
guider: "VanillaCFG"
thresholder: "None"
Expand All @@ -48,8 +48,8 @@ sampling:
s_noise: 1.0
eta: 1.0
order: 4
orig_width: 1024
orig_height: 1024
orig_width: 512
orig_height: 512
crop_coords_top: 0
crop_coords_left: 0
aesthetic_score: 5.0
Expand Down
Loading

0 comments on commit cd8f28e

Please sign in to comment.