From 8dd038171f2352348e5af56c036a1d594839bde6 Mon Sep 17 00:00:00 2001
From: ataghibakhsh
Date: Wed, 21 Aug 2024 10:51:23 -0700
Subject: [PATCH 1/4] fix mamba convert/ add test

---
 .github/workflows/cicd-main.yml               | 13 +++++++
 .../convert_mamba2_pyt_to_nemo.py             | 37 ++++++++++++-------
 2 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index c77e135125c2..718b5af15a9a 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -159,6 +159,19 @@ jobs:
         rm -f /home/TestData/nlp/megatron_ir/sbert/sbert.nemo
         rm -rf /home/TestData/nlp/megatron_ir/sbert/model_weights
 
+  L2_Community_LLM_Checkpoints_tests_Mamba2:
+    needs: [cicd-test-container-setup]
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \
+        --input_name_or_path /home/TestData/nlp/megatron_mamba/model_optim_rng.pt \
+        --output_path /home/TestData/nlp/megatron_mamba/converted_mamba.nemo
+      AFTER_SCRIPT: |
+        rm -f /home/TestData/nlp/megatron_mamba/converted_mamba.nemo
+        rm -rf /home/TestData/nlp/megatron_mamba/model_weights
+
   L2_Community_LLM_Checkpoints_tests_Llama:
     needs: [cicd-test-container-setup]
     uses: ./.github/workflows/_test_template.yml
diff --git a/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py
index 1a0a13709421..7aa1443f268a 100644
--- a/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py
+++ b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py
@@ -26,12 +26,19 @@
 
 '''
 Example
-CUDA_VISIBLE_DEVICES="0" python /NeMo/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \
+CUDA_VISIBLE_DEVICES="0" python /opt/NeMo/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \
     --input_name_or_path \
     --output_path \
     --mamba_ssm_ngroups 8 \
     --precision bf16 \
     --tokenizer_model_dir
+
+CUDA_VISIBLE_DEVICES="0" python /opt/NeMo/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \
+    --input_name_or_path /home/ataghibakhsh/checkpoints/mamba2-130m/model_optim_rng.pt \
+    --output_path /home/ataghibakhsh/checkpoints/mamba2-130m/mamba2_130m.nemo \
+    --mamba_ssm_ngroups 1 \
+    --precision bf16 \
+    --tokenizer_model_dir
 '''
 
 
@@ -63,10 +70,24 @@ def get_args():
 
 def convert(args):
 
-    checkpoint_weights = torch.load(args.input_name_or_path, map_location='cpu')
+    checkpoint_weights = torch.load(args.input_name_or_path, map_location='cpu')['model']
     new_state_dict = {}
 
     if 'backbone' in list(checkpoint_weights.keys())[0]:
+        if 'model' in list(checkpoint_weights.keys())[0]:
+            checkpoint_weights = {key.replace('model.', '', 1): value for key, value in checkpoint_weights.items()}
+
+            # Codestral Mamba Model Tokenizer Settings
+            tokenizer_library = 'megatron'
+            tokenizer_type = 'GPTSentencePieceTokenizer'
+            tokenizer_model = args.tokenizer_model_dir
+
+        else:
+
+            # Tri Dao and Albert Gu Mamba Model Tokenizer Settings
+            tokenizer_library = 'huggingface'
+            tokenizer_type = 'EleutherAI/gpt-neox-20b'
+            tokenizer_model = None
 
         layer_keys = [key for key in checkpoint_weights.keys() if re.match(r'backbone\.layers\.\d+\.', key)]
         layer_numbers = set(int(re.search(r'backbone\.layers\.(\d+)\.', key).group(1)) for key in layer_keys)
@@ -103,11 +124,6 @@ def convert(args):
                 old_key = f'backbone.layers.{i}.{attr}'
                 new_state_dict[new_key] = checkpoint_weights[old_key]
 
-        # Tokenizer settings
-        tokenizer_library = 'huggingface'
-        tokenizer_type = 'EleutherAI/gpt-neox-20b'
-        tokenizer_model = None
-
     else:
 
         layer_keys = [key for key in checkpoint_weights.keys() if re.match(r'decoder\.layers\.\d+\.', key)]
@@ -117,12 +133,7 @@ def convert(args):
         for key, value in checkpoint_weights.items():
             if '.norm.weight' in key and 'mixer' not in key:
                 key = key[:-11] + 'mixer.in_proj.layer_norm_weight'
-            new_state_dict["model." + key] = value
-
-        # Tokenizer settings
-        tokenizer_library = 'megatron'
-        tokenizer_type = 'GPTSentencePieceTokenizer'
-        tokenizer_model = args.tokenizer_model_dir
+                new_state_dict["model." + key] = value
 
     # Tokenizer settings
     tokenizer_library = 'megatron'

From c5181e1b020d811830ae3beaeb4417980f3448f9 Mon Sep 17 00:00:00 2001
From: JRD971000
Date: Wed, 21 Aug 2024 17:52:57 +0000
Subject: [PATCH 2/4] Apply isort and black reformatting

Signed-off-by: JRD971000
---
 scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py
index 7aa1443f268a..427b52d9de3d 100644
--- a/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py
+++ b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py
@@ -133,7 +133,7 @@ def convert(args):
         for key, value in checkpoint_weights.items():
             if '.norm.weight' in key and 'mixer' not in key:
                 key = key[:-11] + 'mixer.in_proj.layer_norm_weight'
-                new_state_dict["model." + key] = value
+            new_state_dict["model." + key] = value
 
     # Tokenizer settings
     tokenizer_library = 'megatron'

From 6b7e60e0dc1ab339b3a8a99698a40451a94ae3aa Mon Sep 17 00:00:00 2001
From: ataghibakhsh
Date: Wed, 21 Aug 2024 10:56:10 -0700
Subject: [PATCH 3/4] add mamba test

---
 .github/workflows/cicd-main.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 718b5af15a9a..d446883ae65a 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -4756,6 +4756,7 @@ jobs:
       - L0_Unit_Tests_GPU
       #- OPTIONAL_L0_Unit_Tests_CPU
       - L2_Community_LLM_Checkpoints_tests_Bert
+      - L2_Community_LLM_Checkpoints_tests_Mamba2
      - L2_Community_LLM_Checkpoints_tests_Llama
      - L2_Community_LLM_Checkpoints_tests_StarCoder
      - L2_Community_LLM_Checkpoints_tests_Falcon

From e72d49b354074566fc78ec08e820ee619be6ca10 Mon Sep 17 00:00:00 2001
From: ataghibakhsh
Date: Wed, 21 Aug 2024 11:30:55 -0700
Subject: [PATCH 4/4] fix ngroup in cicd

---
 .github/workflows/cicd-main.yml                         | 4 +++-
 .../checkpoint_converters/convert_mamba2_pyt_to_nemo.py | 7 -------
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index d446883ae65a..60e6ce724882 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -167,7 +167,9 @@ jobs:
       SCRIPT: |
         python scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \
         --input_name_or_path /home/TestData/nlp/megatron_mamba/model_optim_rng.pt \
-        --output_path /home/TestData/nlp/megatron_mamba/converted_mamba.nemo
+        --output_path /home/TestData/nlp/megatron_mamba/converted_mamba.nemo \
+        --precision=bf16 \
+        --mamba_ssm_ngroups 1
       AFTER_SCRIPT: |
         rm -f /home/TestData/nlp/megatron_mamba/converted_mamba.nemo
         rm -rf /home/TestData/nlp/megatron_mamba/model_weights
diff --git a/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py
index 427b52d9de3d..7a7484bf9c20 100644
--- a/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py
+++ b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py
@@ -32,13 +32,6 @@
     --mamba_ssm_ngroups 8 \
     --precision bf16 \
     --tokenizer_model_dir
-
-CUDA_VISIBLE_DEVICES="0" python /opt/NeMo/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \
-    --input_name_or_path /home/ataghibakhsh/checkpoints/mamba2-130m/model_optim_rng.pt \
-    --output_path /home/ataghibakhsh/checkpoints/mamba2-130m/mamba2_130m.nemo \
-    --mamba_ssm_ngroups 1 \
-    --precision bf16 \
-    --tokenizer_model_dir
 '''