remove exp dir (NVIDIA#10460)
* remove exp dir

* add back tests

* Apply isort and black reformatting

Signed-off-by: JRD971000 <JRD971000@users.noreply.github.com>

* add run_id to dir

* fix dir->log_dir

---------

Signed-off-by: JRD971000 <JRD971000@users.noreply.github.com>
Co-authored-by: JRD971000 <JRD971000@users.noreply.github.com>
JRD971000 committed Sep 23, 2024
1 parent 9ed0d6c commit c6d1b7d
Showing 3 changed files with 18 additions and 16 deletions.
24 changes: 11 additions & 13 deletions .github/workflows/cicd-main.yml
@@ -5186,41 +5186,39 @@ jobs:
         rm -rf tests/collections/llm/gpt_pretrain_results
         rm -rf tests/collections/llm/gpt_index_mappings
-  OPTIONAL_L2_NeMo_2_SSM_Pretraining:
+  L2_NeMo_2_SSM_Pretraining:
     needs: [cicd-test-container-setup]
     uses: ./.github/workflows/_test_template.yml
-    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_NeMo_2_SSM_Pretraining') || needs.cicd-test-container-setup.outputs.all == 'true'
+    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_SSM_Pretraining') || needs.cicd-test-container-setup.outputs.all == 'true'
     with:
       RUNNER: self-hosted-azure
       SCRIPT: |
         python tests/collections/llm/gpt/model/megatron_ssm_pretraining.py \
         --devices 1 \
-        --experiment-dir /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_pretrain \
         --max-steps 10 \
+        --experiment-dir /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_pretrain/${{ github.run_id }} \
         --data-path /home/TestData/nlp/megatron_mamba/toy_ssm_dataset/legal_pile_text_document
       AFTER_SCRIPT: |
-        rm -rf /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_pretrain
-      IS_OPTIONAL: true
+        rm -rf /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_pretrain/${{ github.run_id }}
-  OPTIONAL_L2_NeMo_2_SSM_Finetuning:
+  L2_NeMo_2_SSM_Finetuning:
     needs: [cicd-test-container-setup]
     uses: ./.github/workflows/_test_template.yml
-    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_NeMo_2_SSM_Finetuning') || needs.cicd-test-container-setup.outputs.all == 'true'
+    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_SSM_Finetuning') || needs.cicd-test-container-setup.outputs.all == 'true'
     with:
       RUNNER: self-hosted-azure
       SCRIPT: |
         python tests/collections/llm/gpt/model/megatron_ssm_finetuning.py \
         --devices 1 \
-        --experiment-dir /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_sft \
         --max-steps 10 \
+        --experiment-dir /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_sft/${{ github.run_id }} \
         --model-path /home/TestData/nlp/megatron_mamba/model_optim_rng.pt
       AFTER_SCRIPT: |
-        rm -rf /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_sft
-      IS_OPTIONAL: true
+        rm -rf /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_sft/${{ github.run_id }}
   Nemo_CICD_Test:
     needs:
@@ -5356,8 +5354,8 @@ jobs:
       #- OPTIONAL_L2_Stable_Diffusion_Training
       - L2_NeMo_2_GPT_Pretraining_no_transformer_engine
       - L2_NeMo_2_GPT_DDP_Param_Parity_check
-      #- OPTIONAL_L2_NeMo_2_SSM_Pretraining
-      #- OPTIONAL_L2_NeMo_2_SSM_Finetuning
+      - L2_NeMo_2_SSM_Pretraining
+      - L2_NeMo_2_SSM_Finetuning
     if: always()
     runs-on: ubuntu-latest
     steps:
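
The workflow change appends the unique ${{ github.run_id }} to each test's experiment directory, so concurrent CI runs write to distinct paths and the AFTER_SCRIPT cleanup removes only that run's artifacts; the IS_OPTIONAL flag is dropped because both jobs are now required. A minimal Python sketch of the same per-run isolation pattern (GITHUB_RUN_ID is the variable Actions sets for each run; the "local" fallback is an assumption for running outside CI):

    import os
    import shutil

    # Per-run directory, mirroring the workflow's ${{ github.run_id }} suffix.
    base = "/home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_pretrain"
    run_id = os.environ.get("GITHUB_RUN_ID", "local")
    experiment_dir = os.path.join(base, run_id)

    os.makedirs(experiment_dir, exist_ok=True)
    try:
        pass  # run the pretraining test, writing results under experiment_dir
    finally:
        # Equivalent of the workflow's AFTER_SCRIPT: delete only this run's artifacts.
        shutil.rmtree(experiment_dir, ignore_errors=True)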
6 changes: 4 additions & 2 deletions tests/collections/llm/gpt/model/megatron_ssm_finetuning.py
@@ -32,7 +32,9 @@ def get_args():
     parser = argparse.ArgumentParser(description='Train a small GPT model using NeMo 2.0')
     parser.add_argument('--devices', type=int, help="Number of devices to use for training")
     parser.add_argument('--max-steps', type=int, help="Number of steps to train for")
-    parser.add_argument('--experiment-dir', type=str, help="directory to write results and checkpoints to")
+    parser.add_argument(
+        '--experiment-dir', type=str, default=None, help="directory to write results and checkpoints to"
+    )
     parser.add_argument('--model-path', type=str, help="Path to model checkpoint")
     parser.add_argument(
         '--tokenizer-model-path', type=str, default=None, help="Path to tokenizer model, defaults to None"
@@ -98,7 +100,7 @@ def get_args():
     )

     nemo_logger = NeMoLogger(
-        dir=args.experiment_dir,
+        log_dir=args.experiment_dir,
     )

     data = llm.SquadDataModule(
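
The second hunk renames the NeMoLogger keyword from dir to log_dir to match the logger's constructor. A minimal usage sketch, assuming NeMoLogger is imported from nemo.lightning as in the test script and args comes from get_args() above:

    from nemo.lightning import NeMoLogger

    # log_dir (not dir) points results at the per-run path, e.g.
    # .../cicd_test_sft/<run_id>; None lets the logger pick its default.
    nemo_logger = NeMoLogger(
        log_dir=args.experiment_dir,
    )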
4 changes: 3 additions & 1 deletion tests/collections/llm/gpt/model/megatron_ssm_pretraining.py
@@ -33,7 +33,9 @@ def get_args():
     parser = argparse.ArgumentParser(description='Train a Mamba model using NeMo 2.0')
     parser.add_argument('--devices', type=int, help="Number of devices to use for training")
     parser.add_argument('--max-steps', type=int, help="Number of steps to train for")
-    parser.add_argument('--experiment-dir', type=str, help="directory to write results and checkpoints to")
+    parser.add_argument(
+        '--experiment-dir', type=str, default=None, help="directory to write results and checkpoints to"
+    )
     parser.add_argument('--data-path', type=str, help="Path to data file")
     parser.add_argument('--tokenizer-path', type=str, default=None, help="Path to tokenizer model")
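
Both test scripts give --experiment-dir an explicit default=None (argparse already defaults optional flags to None, so this mainly makes the intent visible after the black reformat). A short sketch of the resulting parser behavior, assuming the rest of get_args() is unchanged:

    import argparse

    parser = argparse.ArgumentParser(description='Train a Mamba model using NeMo 2.0')
    parser.add_argument(
        '--experiment-dir', type=str, default=None, help="directory to write results and checkpoints to"
    )

    # Flag omitted: experiment_dir is None and the logger falls back to its default location.
    assert parser.parse_args([]).experiment_dir is None

    # Flag given: the per-run CI path is used verbatim.
    args = parser.parse_args(['--experiment-dir', '/tmp/cicd_test_pretrain/12345'])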
