From 8dd038171f2352348e5af56c036a1d594839bde6 Mon Sep 17 00:00:00 2001
From: ataghibakhsh
Date: Wed, 21 Aug 2024 10:51:23 -0700
Subject: [PATCH 1/4] fix mamba convert/ add test

---
 .github/workflows/cicd-main.yml               | 13 +++++++
 .../convert_mamba2_pyt_to_nemo.py             | 37 ++++++++++++-------
 2 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index c77e135125c2..718b5af15a9a 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -159,6 +159,19 @@ jobs:
         rm -f /home/TestData/nlp/megatron_ir/sbert/sbert.nemo
         rm -rf /home/TestData/nlp/megatron_ir/sbert/model_weights
 
+  L2_Community_LLM_Checkpoints_tests_Mamba2:
+    needs: [cicd-test-container-setup]
+    uses: ./.github/workflows/_test_template.yml
+    with:
+      RUNNER: self-hosted-azure
+      SCRIPT: |
+        python scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \
+        --input_name_or_path /home/TestData/nlp/megatron_mamba/model_optim_rng.pt \
+        --output_path /home/TestData/nlp/megatron_mamba/converted_mamba.nemo
+      AFTER_SCRIPT: |
+        rm -f /home/TestData/nlp/megatron_mamba/converted_mamba.nemo
+        rm -rf /home/TestData/nlp/megatron_mamba/model_weights
+
   L2_Community_LLM_Checkpoints_tests_Llama:
     needs: [cicd-test-container-setup]
     uses: ./.github/workflows/_test_template.yml
diff --git a/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py
index 1a0a13709421..7aa1443f268a 100644
--- a/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py
+++ b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py
@@ -26,12 +26,19 @@
 
 '''
 Example
-CUDA_VISIBLE_DEVICES="0" python /NeMo/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \
+CUDA_VISIBLE_DEVICES="0" python /opt/NeMo/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \
     --input_name_or_path \
     --output_path \
     --mamba_ssm_ngroups 8 \
     --precision bf16 \
     --tokenizer_model_dir
+
+CUDA_VISIBLE_DEVICES="0" python /opt/NeMo/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \
+    --input_name_or_path /home/ataghibakhsh/checkpoints/mamba2-130m/model_optim_rng.pt \
+    --output_path /home/ataghibakhsh/checkpoints/mamba2-130m/mamba2_130m.nemo \
+    --mamba_ssm_ngroups 1 \
+    --precision bf16 \
+    --tokenizer_model_dir
 '''
 
 
@@ -63,10 +70,24 @@ def get_args():
 
 def convert(args):
 
-    checkpoint_weights = torch.load(args.input_name_or_path, map_location='cpu')
+    checkpoint_weights = torch.load(args.input_name_or_path, map_location='cpu')['model']
     new_state_dict = {}
 
     if 'backbone' in list(checkpoint_weights.keys())[0]:
+        if 'model' in list(checkpoint_weights.keys())[0]:
+            checkpoint_weights = {key.replace('model.', '', 1): value for key, value in checkpoint_weights.items()}
+
+            # Codestral Mamba Model Tokenizer Settings
+            tokenizer_library = 'megatron'
+            tokenizer_type = 'GPTSentencePieceTokenizer'
+            tokenizer_model = args.tokenizer_model_dir
+
+        else:
+
+            # Tri Dao and Albert Gu Mamba Model Tokenizer Settings
+            tokenizer_library = 'huggingface'
+            tokenizer_type = 'EleutherAI/gpt-neox-20b'
+            tokenizer_model = None
 
         layer_keys = [key for key in checkpoint_weights.keys() if re.match(r'backbone\.layers\.\d+\.', key)]
         layer_numbers = set(int(re.search(r'backbone\.layers\.(\d+)\.', key).group(1)) for key in layer_keys)
@@ -103,11 +124,6 @@ def convert(args):
                 old_key = f'backbone.layers.{i}.{attr}'
                 new_state_dict[new_key] = checkpoint_weights[old_key]
 
-        # Tokenizer settings
-        tokenizer_library = 'huggingface'
-        tokenizer_type = 'EleutherAI/gpt-neox-20b'
-        tokenizer_model = None
-
     else:
 
         layer_keys = [key for key in checkpoint_weights.keys() if re.match(r'decoder\.layers\.\d+\.', key)]
@@ -117,12 +133,7 @@ def convert(args):
         for key, value in checkpoint_weights.items():
             if '.norm.weight' in key and 'mixer' not in key:
                 key = key[:-11] + 'mixer.in_proj.layer_norm_weight'
-            new_state_dict["model." + key] = value
-
-        # Tokenizer settings
-        tokenizer_library = 'megatron'
-        tokenizer_type = 'GPTSentencePieceTokenizer'
-        tokenizer_model = args.tokenizer_model_dir
+                new_state_dict["model." + key] = value
 
     # Tokenizer settings
     tokenizer_library = 'megatron'

From c5181e1b020d811830ae3beaeb4417980f3448f9 Mon Sep 17 00:00:00 2001
From: JRD971000
Date: Wed, 21 Aug 2024 17:52:57 +0000
Subject: [PATCH 2/4] Apply isort and black reformatting

Signed-off-by: JRD971000
---
 scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py
index 7aa1443f268a..427b52d9de3d 100644
--- a/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py
+++ b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py
@@ -133,7 +133,7 @@ def convert(args):
         for key, value in checkpoint_weights.items():
             if '.norm.weight' in key and 'mixer' not in key:
                 key = key[:-11] + 'mixer.in_proj.layer_norm_weight'
-                new_state_dict["model." + key] = value
+            new_state_dict["model." + key] = value
 
     # Tokenizer settings
     tokenizer_library = 'megatron'

From 6b7e60e0dc1ab339b3a8a99698a40451a94ae3aa Mon Sep 17 00:00:00 2001
From: ataghibakhsh
Date: Wed, 21 Aug 2024 10:56:10 -0700
Subject: [PATCH 3/4] add mamba test

---
 .github/workflows/cicd-main.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 718b5af15a9a..d446883ae65a 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -4756,6 +4756,7 @@ jobs:
       - L0_Unit_Tests_GPU
       #- OPTIONAL_L0_Unit_Tests_CPU
       - L2_Community_LLM_Checkpoints_tests_Bert
+      - L2_Community_LLM_Checkpoints_tests_Mamba2
      - L2_Community_LLM_Checkpoints_tests_Llama
      - L2_Community_LLM_Checkpoints_tests_StarCoder
      - L2_Community_LLM_Checkpoints_tests_Falcon

From e72d49b354074566fc78ec08e820ee619be6ca10 Mon Sep 17 00:00:00 2001
From: ataghibakhsh
Date: Wed, 21 Aug 2024 11:30:55 -0700
Subject: [PATCH 4/4] fix ngroup in cicd

---
 .github/workflows/cicd-main.yml                         | 4 +++-
 .../checkpoint_converters/convert_mamba2_pyt_to_nemo.py | 7 -------
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index d446883ae65a..60e6ce724882 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -167,7 +167,9 @@ jobs:
       SCRIPT: |
         python scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \
         --input_name_or_path /home/TestData/nlp/megatron_mamba/model_optim_rng.pt \
-        --output_path /home/TestData/nlp/megatron_mamba/converted_mamba.nemo
+        --output_path /home/TestData/nlp/megatron_mamba/converted_mamba.nemo \
+        --precision=bf16 \
+        --mamba_ssm_ngroups 1
       AFTER_SCRIPT: |
         rm -f /home/TestData/nlp/megatron_mamba/converted_mamba.nemo
         rm -rf /home/TestData/nlp/megatron_mamba/model_weights
diff --git a/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py
index 427b52d9de3d..7a7484bf9c20 100644
--- a/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py
+++ b/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py
@@ -32,13 +32,6 @@
     --mamba_ssm_ngroups 8 \
     --precision bf16 \
     --tokenizer_model_dir
-
-CUDA_VISIBLE_DEVICES="0" python /opt/NeMo/scripts/checkpoint_converters/convert_mamba2_pyt_to_nemo.py \
-    --input_name_or_path /home/ataghibakhsh/checkpoints/mamba2-130m/model_optim_rng.pt \
-    --output_path /home/ataghibakhsh/checkpoints/mamba2-130m/mamba2_130m.nemo \
-    --mamba_ssm_ngroups 1 \
-    --precision bf16 \
-    --tokenizer_model_dir
 '''