From c6d1b7dd30c6672f2ca0e9f2d916edfa31676d75 Mon Sep 17 00:00:00 2001
From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com>
Date: Mon, 23 Sep 2024 09:51:43 -0500
Subject: [PATCH] remove exp dir (#10460)

* remove exp dir

* add back tests

* Apply isort and black reformatting

Signed-off-by: JRD971000

* add run_id to dir

* fix dir->log_dir

---------

Signed-off-by: JRD971000
Co-authored-by: JRD971000
---
 .github/workflows/cicd-main.yml               | 24 +++++++++++-------------
 .../llm/gpt/model/megatron_ssm_finetuning.py  |  6 ++++--
 .../llm/gpt/model/megatron_ssm_pretraining.py |  4 +++-
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 2fa47ba93475..730c363b41f3 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -5186,41 +5186,39 @@ jobs:
         rm -rf tests/collections/llm/gpt_pretrain_results
         rm -rf tests/collections/llm/gpt_index_mappings
 
-  OPTIONAL_L2_NeMo_2_SSM_Pretraining:
+  L2_NeMo_2_SSM_Pretraining:
     needs: [cicd-test-container-setup]
     uses: ./.github/workflows/_test_template.yml
-    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_NeMo_2_SSM_Pretraining') || needs.cicd-test-container-setup.outputs.all == 'true'
+    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_SSM_Pretraining') || needs.cicd-test-container-setup.outputs.all == 'true'
     with:
       RUNNER: self-hosted-azure
       SCRIPT: |
         python tests/collections/llm/gpt/model/megatron_ssm_pretraining.py \
           --devices 1 \
-          --experiment-dir /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_pretrain \
           --max-steps 10 \
+          --experiment-dir /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_pretrain/${{ github.run_id }} \
           --data-path /home/TestData/nlp/megatron_mamba/toy_ssm_dataset/legal_pile_text_document
-
+
       AFTER_SCRIPT: |
-        rm -rf /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_pretrain
-      IS_OPTIONAL: true
+        rm -rf /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_pretrain/${{ github.run_id }}
 
-  OPTIONAL_L2_NeMo_2_SSM_Finetuning:
+  L2_NeMo_2_SSM_Finetuning:
     needs: [cicd-test-container-setup]
     uses: ./.github/workflows/_test_template.yml
-    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_NeMo_2_SSM_Finetuning') || needs.cicd-test-container-setup.outputs.all == 'true'
+    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_SSM_Finetuning') || needs.cicd-test-container-setup.outputs.all == 'true'
     with:
       RUNNER: self-hosted-azure
       SCRIPT: |
         python tests/collections/llm/gpt/model/megatron_ssm_finetuning.py \
           --devices 1 \
-          --experiment-dir /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_sft \
           --max-steps 10 \
+          --experiment-dir /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_sft/${{ github.run_id }} \
           --model-path /home/TestData/nlp/megatron_mamba/model_optim_rng.pt
       AFTER_SCRIPT: |
-        rm -rf /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_sft
-      IS_OPTIONAL: true
+        rm -rf /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_sft/${{ github.run_id }}
 
   Nemo_CICD_Test:
     needs:
@@ -5356,8 +5354,8 @@ jobs:
       #- OPTIONAL_L2_Stable_Diffusion_Training
       - L2_NeMo_2_GPT_Pretraining_no_transformer_engine
       - L2_NeMo_2_GPT_DDP_Param_Parity_check
-      #- OPTIONAL_L2_NeMo_2_SSM_Pretraining
-      #- OPTIONAL_L2_NeMo_2_SSM_Finetuning
+      - L2_NeMo_2_SSM_Pretraining
+      - L2_NeMo_2_SSM_Finetuning
     if: always()
     runs-on: ubuntu-latest
     steps:
diff --git a/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py b/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py
index 4b748c298105..67174974f9a3 100644
--- a/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py
+++ b/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py
@@ -32,7 +32,9 @@ def get_args():
     parser = argparse.ArgumentParser(description='Train a small GPT model using NeMo 2.0')
     parser.add_argument('--devices', type=int, help="Number of devices to use for training")
     parser.add_argument('--max-steps', type=int, help="Number of steps to train for")
-    parser.add_argument('--experiment-dir', type=str, help="directory to write results and checkpoints to")
+    parser.add_argument(
+        '--experiment-dir', type=str, default=None, help="directory to write results and checkpoints to"
+    )
     parser.add_argument('--model-path', type=str, help="Path to model checkpoint")
     parser.add_argument(
         '--tokenizer-model-path', type=str, default=None, help="Path to tokenizer model, defaults to None"
@@ -98,7 +100,7 @@ def get_args():
     )
 
     nemo_logger = NeMoLogger(
-        dir=args.experiment_dir,
+        log_dir=args.experiment_dir,
     )
 
     data = llm.SquadDataModule(
diff --git a/tests/collections/llm/gpt/model/megatron_ssm_pretraining.py b/tests/collections/llm/gpt/model/megatron_ssm_pretraining.py
index 30284bb5b6f1..d7ecaafaaf8c 100644
--- a/tests/collections/llm/gpt/model/megatron_ssm_pretraining.py
+++ b/tests/collections/llm/gpt/model/megatron_ssm_pretraining.py
@@ -33,7 +33,9 @@ def get_args():
     parser = argparse.ArgumentParser(description='Train a Mamba model using NeMo 2.0')
     parser.add_argument('--devices', type=int, help="Number of devices to use for training")
     parser.add_argument('--max-steps', type=int, help="Number of steps to train for")
-    parser.add_argument('--experiment-dir', type=str, help="directory to write results and checkpoints to")
+    parser.add_argument(
+        '--experiment-dir', type=str, default=None, help="directory to write results and checkpoints to"
+    )
     parser.add_argument('--data-path', type=str, help="Path to data file")
     parser.add_argument('--tokenizer-path', type=str, default=None, help="Path to tokenizer model")
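
Both script changes follow the same pattern: --experiment-dir now defaults to None, and its value is
passed to NeMo 2.0's NeMoLogger through the log_dir keyword (the earlier dir keyword was the bug the
"fix dir->log_dir" commit addresses). A minimal runnable sketch of that pattern follows; the import
path nemo.lightning.NeMoLogger is an assumption, since the diff only shows the call site:

    # Sketch of the per-run experiment-directory pattern from this patch.
    # Assumption: NeMoLogger is importable from nemo.lightning (the diff
    # only shows the NeMoLogger(...) call, not the import).
    import argparse

    from nemo.lightning import NeMoLogger


    def get_args():
        parser = argparse.ArgumentParser(description='Per-run experiment directory example')
        # default=None lets NeMoLogger fall back to its own default
        # location when the flag is omitted.
        parser.add_argument(
            '--experiment-dir', type=str, default=None, help="directory to write results and checkpoints to"
        )
        return parser.parse_args()


    if __name__ == '__main__':
        args = get_args()
        # CI passes a run-scoped path, e.g. .../cicd_test_pretrain/${{ github.run_id }},
        # so concurrent workflow runs never write to the same directory.
        nemo_logger = NeMoLogger(log_dir=args.experiment_dir)  # keyword is log_dir, not dir

In the workflow, the same run-scoped path is passed on the command line and removed in AFTER_SCRIPT,
so each CI run cleans up only its own artifacts.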