From c6d1b7dd30c6672f2ca0e9f2d916edfa31676d75 Mon Sep 17 00:00:00 2001
From: Ali Taghibakhshi <71892896+JRD971000@users.noreply.github.com>
Date: Mon, 23 Sep 2024 09:51:43 -0500
Subject: [PATCH] remove exp dir (#10460)

* remove exp dir

* add back tests

* Apply isort and black reformatting

Signed-off-by: JRD971000

* add run_id to dir

* fix dir->log_dir

---------

Signed-off-by: JRD971000
Co-authored-by: JRD971000
---
 .github/workflows/cicd-main.yml               | 24 +++++++++++-------------
 .../llm/gpt/model/megatron_ssm_finetuning.py  |  6 ++++--
 .../llm/gpt/model/megatron_ssm_pretraining.py |  4 +++-
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 2fa47ba93475..730c363b41f3 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -5186,41 +5186,39 @@ jobs:
         rm -rf tests/collections/llm/gpt_pretrain_results
         rm -rf tests/collections/llm/gpt_index_mappings
 
-  OPTIONAL_L2_NeMo_2_SSM_Pretraining:
+  L2_NeMo_2_SSM_Pretraining:
     needs: [cicd-test-container-setup]
     uses: ./.github/workflows/_test_template.yml
-    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_NeMo_2_SSM_Pretraining') || needs.cicd-test-container-setup.outputs.all == 'true'
+    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_SSM_Pretraining') || needs.cicd-test-container-setup.outputs.all == 'true'
     with:
       RUNNER: self-hosted-azure
       SCRIPT: |
         python tests/collections/llm/gpt/model/megatron_ssm_pretraining.py \
           --devices 1 \
-          --experiment-dir /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_pretrain \
           --max-steps 10 \
+          --experiment-dir /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_pretrain/${{ github.run_id }} \
           --data-path /home/TestData/nlp/megatron_mamba/toy_ssm_dataset/legal_pile_text_document
-
+
       AFTER_SCRIPT: |
-        rm -rf /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_pretrain
-      IS_OPTIONAL: true
+        rm -rf /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_pretrain/${{ github.run_id }}
 
-  OPTIONAL_L2_NeMo_2_SSM_Finetuning:
+  L2_NeMo_2_SSM_Finetuning:
     needs: [cicd-test-container-setup]
     uses: ./.github/workflows/_test_template.yml
-    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_NeMo_2_SSM_Finetuning') || needs.cicd-test-container-setup.outputs.all == 'true'
+    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_SSM_Finetuning') || needs.cicd-test-container-setup.outputs.all == 'true'
     with:
       RUNNER: self-hosted-azure
       SCRIPT: |
         python tests/collections/llm/gpt/model/megatron_ssm_finetuning.py \
           --devices 1 \
-          --experiment-dir /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_sft \
           --max-steps 10 \
+          --experiment-dir /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_sft/${{ github.run_id }} \
           --model-path /home/TestData/nlp/megatron_mamba/model_optim_rng.pt
       AFTER_SCRIPT: |
-        rm -rf /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_sft
-      IS_OPTIONAL: true
+        rm -rf /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_sft/${{ github.run_id }}
 
   Nemo_CICD_Test:
     needs:
@@ -5356,8 +5354,8 @@ jobs:
       #- OPTIONAL_L2_Stable_Diffusion_Training
       - L2_NeMo_2_GPT_Pretraining_no_transformer_engine
       - L2_NeMo_2_GPT_DDP_Param_Parity_check
-      #- OPTIONAL_L2_NeMo_2_SSM_Pretraining
-      #- OPTIONAL_L2_NeMo_2_SSM_Finetuning
+      - L2_NeMo_2_SSM_Pretraining
+      - L2_NeMo_2_SSM_Finetuning
     if: always()
     runs-on: ubuntu-latest
     steps:
diff --git a/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py b/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py
index 4b748c298105..67174974f9a3 100644
--- a/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py
+++ b/tests/collections/llm/gpt/model/megatron_ssm_finetuning.py
@@ -32,7 +32,9 @@ def get_args():
     parser = argparse.ArgumentParser(description='Train a small GPT model using NeMo 2.0')
     parser.add_argument('--devices', type=int, help="Number of devices to use for training")
     parser.add_argument('--max-steps', type=int, help="Number of steps to train for")
-    parser.add_argument('--experiment-dir', type=str, help="directory to write results and checkpoints to")
+    parser.add_argument(
+        '--experiment-dir', type=str, default=None, help="directory to write results and checkpoints to"
+    )
     parser.add_argument('--model-path', type=str, help="Path to model checkpoint")
     parser.add_argument(
         '--tokenizer-model-path', type=str, default=None, help="Path to tokenizer model, defaults to None"
@@ -98,7 +100,7 @@ def get_args():
     )
 
     nemo_logger = NeMoLogger(
-        dir=args.experiment_dir,
+        log_dir=args.experiment_dir,
     )
 
     data = llm.SquadDataModule(
diff --git a/tests/collections/llm/gpt/model/megatron_ssm_pretraining.py b/tests/collections/llm/gpt/model/megatron_ssm_pretraining.py
index 30284bb5b6f1..d7ecaafaaf8c 100644
--- a/tests/collections/llm/gpt/model/megatron_ssm_pretraining.py
+++ b/tests/collections/llm/gpt/model/megatron_ssm_pretraining.py
@@ -33,7 +33,9 @@ def get_args():
     parser = argparse.ArgumentParser(description='Train a Mamba model using NeMo 2.0')
     parser.add_argument('--devices', type=int, help="Number of devices to use for training")
     parser.add_argument('--max-steps', type=int, help="Number of steps to train for")
-    parser.add_argument('--experiment-dir', type=str, help="directory to write results and checkpoints to")
+    parser.add_argument(
+        '--experiment-dir', type=str, default=None, help="directory to write results and checkpoints to"
+    )
     parser.add_argument('--data-path', type=str, help="Path to data file")
     parser.add_argument('--tokenizer-path', type=str, default=None, help="Path to tokenizer model")
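
Both script changes follow the same pattern: --experiment-dir now defaults to None, and its value is
passed to NeMo 2.0's NeMoLogger through the log_dir keyword (the earlier dir keyword was the bug the
"fix dir->log_dir" commit addresses). A minimal runnable sketch of that pattern follows; the import
path nemo.lightning.NeMoLogger is an assumption, since the diff only shows the call site:

    # Sketch of the per-run experiment-directory pattern from this patch.
    # Assumption: NeMoLogger is importable from nemo.lightning (the diff
    # only shows the NeMoLogger(...) call, not the import).
    import argparse

    from nemo.lightning import NeMoLogger


    def get_args():
        parser = argparse.ArgumentParser(description='Per-run experiment directory example')
        # default=None lets NeMoLogger fall back to its own default
        # location when the flag is omitted.
        parser.add_argument(
            '--experiment-dir', type=str, default=None, help="directory to write results and checkpoints to"
        )
        return parser.parse_args()


    if __name__ == '__main__':
        args = get_args()
        # CI passes a run-scoped path, e.g. .../cicd_test_pretrain/${{ github.run_id }},
        # so concurrent workflow runs never write to the same directory.
        nemo_logger = NeMoLogger(log_dir=args.experiment_dir)  # keyword is log_dir, not dir

In the workflow, the same run-scoped path is passed on the command line and removed in AFTER_SCRIPT,
so each CI run cleans up only its own artifacts.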