Signed-off-by: slyne deng <slyned@nvidia.com>
Co-authored-by: slyne deng <slyned@nvidia.com>
Showing 9 changed files with 810 additions and 28 deletions.
## Setup

In this part, we are going to export the SALM model into TensorRT-LLM. First, let's download the [SALM NeMo model](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/speechllm_fc_llama2_7b/) from NVIDIA NGC:

```bash
wget --content-disposition 'https://api.ngc.nvidia.com/v2/models/org/nvidia/team/nemo/speechllm_fc_llama2_7b/1.23.1/files?redirect=true&path=speechllm_fc_llama2_7b.nemo' -O speechllm_fc_llama2_7b.nemo
```

Then, we need to extract the different parts of SALM (the perception module, the LoRA weights, and the base LLM):

```bash
output=$PWD/output
python3 extract_salm_weights.py --model_file_path=speechllm_fc_llama2_7b.nemo --output_dir=$output
```

The above command takes a while to run.

Under the `output` dir, you'll see:

```
output
|___speechllm_fc_llama2_7b_lora.nemo
|___speechllm_fc_llama2_7b_perception
|   |____model_config.yaml
|   |____model_weights.ckpt
|___speechllm_fc_llama2_7b_llm.nemo
|___ xxx.tokenizer.model
```
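
As a quick sanity check that the extraction worked, you can load the perception config and weights directly (a minimal sketch; the paths follow the layout above):

```python
import torch
from omegaconf import OmegaConf

# Load the extracted perception config and weights written by extract_salm_weights.py.
cfg = OmegaConf.load("output/speechllm_fc_llama2_7b_perception/model_config.yaml")
weights = torch.load("output/speechllm_fc_llama2_7b_perception/model_weights.ckpt", map_location="cpu")

print(OmegaConf.to_yaml(cfg))    # the perception module configuration
print(list(weights.keys())[:5])  # a few of the saved parameter names
```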

After we get the LoRA NeMo model and the LLM NeMo model, we can merge the LoRA weights into the LLM:

```bash
python /opt/NeMo/scripts/nlp_language_modeling/merge_lora_weights/merge.py \
    trainer.accelerator=gpu \
    tensor_model_parallel_size=1 \
    pipeline_model_parallel_size=1 \
    gpt_model_file=output/speechllm_fc_llama2_7b_llm.nemo \
    lora_model_path=output/speechllm_fc_llama2_7b_lora.nemo \
    merged_model_path=speechllm_fc_llama2_7b_llm_merged.nemo
```
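
If you want to confirm the merged checkpoint is loadable before exporting, you can restore it with NeMo (a minimal sketch, assuming a single GPU; it only verifies that the checkpoint loads):

```python
from pytorch_lightning.trainer.trainer import Trainer

from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy

# restore_from unpacks the .nemo archive and loads the merged weights
trainer = Trainer(devices=1, accelerator="gpu", strategy=NLPDDPStrategy())
model = MegatronGPTModel.restore_from(
    restore_path="speechllm_fc_llama2_7b_llm_merged.nemo", trainer=trainer
)
print(model.cfg.precision)
```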

Now we are able to export the engine:

```bash
python3 export_salm.py \
    model.perception_model_path=output/speechllm_fc_llama2_7b_perception \
    model.llm_model_path=output/speechllm_fc_llama2_7b_llm_merged.nemo
```

You should find the generated engines under the `./salm` folder. To run the engines:

```python
from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter

output_dir = "/ws/salm"  # the engine directory
trt_llm_exporter = TensorRTMMExporter(model_dir=output_dir, load_model=True, modality='audio')
input_text = "Q: what's the transcription of the audio? A:"
input_media = '/ws/data/test_audio.wav'
print(trt_llm_exporter.forward(input_text, input_media))
```
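
Note that `load_model=True` here tells `TensorRTMMExporter` to load the engines that `export_salm.py` wrote to `model_dir`, in contrast to the `load_model=False` used at export time (see `export_salm.py` below).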

## Deploy

If you want to generate the engines and deploy them with the Triton Inference Server, you may also run:

```bash
python3 NeMo/scripts/deploy/multimodal/deploy_triton.py \
    --modality="audio" \
    --visual_checkpoint=NeMo/examples/multimodal/speech_llm/export/output/speechllm_fc_llama2_7b_perception \
    --llm_checkpoint=NeMo/examples/multimodal/speech_llm/export/output/speechllm_fc_llama2_7b_llm_merged.nemo \
    --llm_model_type="llama" \
    --model_type="salm" \
    --triton_model_name="salm" \
    --max_input_len=4096 \
    --max_output_len=256 \
    --max_multimodal_len=3072 \
    --triton_model_repository=/tmp/trt_model_dir/
```
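
The `--max_input_len`, `--max_output_len`, and `--max_multimodal_len` flags mirror the `infer` settings used at export time (see `salm_export.yaml` below).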

And on the client side, you may run:

```bash
python3 NeMo/scripts/deploy/multimodal/query.py \
    --model_name="salm" \
    --model_type="salm" \
    --input_text="Q: what's the transcription of the audio? A:" \
    --input_media=/ws/data/test_audio.wav
```

For more details, please check `NeMo/scripts/deploy/multimodal/deploy_triton.py` and `NeMo/scripts/deploy/multimodal/query.py`.

examples/multimodal/speech_llm/export/conf/salm_export.yaml:
```yaml
name: speechllm_salm
infer:
  output_dir: ./salm
  max_batch_size: 1
  tensor_parallelism: 1
  max_input_len: 4096
  max_output_len: 256
  max_multimodal_len: 3072
  perception_max_batch_size: 1

model:
  type: salm
  precision: float16
  perception_model_path: /path/to/speechllm_llama2_7b_perception
  llm_model_path: /path/to/speechllm_llama2_7b_llm.nemo
  llm_model_type: llama
```
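
The perception and LLM paths above are placeholders that the README invocation overrides on the command line. You can also load and tweak this config programmatically; a small sketch with OmegaConf (the override values are only examples):

```python
from omegaconf import OmegaConf

# Load the export config and apply a couple of example overrides.
cfg = OmegaConf.load("examples/multimodal/speech_llm/export/conf/salm_export.yaml")
overrides = OmegaConf.from_dotlist(
    ["infer.max_batch_size=2", "model.llm_model_path=output/speechllm_fc_llama2_7b_llm_merged.nemo"]
)
cfg = OmegaConf.merge(cfg, overrides)
print(OmegaConf.to_yaml(cfg))
```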

examples/multimodal/speech_llm/export/export_salm.py:
```python
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from nemo.core.config import hydra_runner
from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter


@hydra_runner(config_path='conf', config_name='salm_export')
def main(cfg):
    # Build the exporter without loading engines; export() writes them to cfg.infer.output_dir.
    exporter = TensorRTMMExporter(model_dir=cfg.infer.output_dir, load_model=False, modality='audio')
    exporter.export(
        visual_checkpoint_path=cfg.model.perception_model_path,
        llm_checkpoint_path=cfg.model.llm_model_path,
        model_type=cfg.model.type,
        llm_model_type=cfg.model.llm_model_type,
        tensor_parallel_size=cfg.infer.tensor_parallelism,
        max_input_len=cfg.infer.max_input_len,
        max_output_len=cfg.infer.max_output_len,
        vision_max_batch_size=cfg.infer.perception_max_batch_size,
        max_batch_size=cfg.infer.max_batch_size,
        max_multimodal_len=cfg.infer.max_multimodal_len,
        dtype=cfg.model.precision,
        load_model=False,
    )


if __name__ == '__main__':
    main()
```
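
Because `main` is wrapped in `hydra_runner`, every field in `salm_export.yaml` can be overridden on the command line with Hydra's `key=value` syntax; that is how the README invocation above passes `model.perception_model_path` and `model.llm_model_path`.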

examples/multimodal/speech_llm/export/extract_salm_weights.py:

```python
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import tempfile

import torch
from megatron.core import dist_checkpointing
from omegaconf import OmegaConf
from pytorch_lightning.trainer.trainer import Trainer

from nemo.collections.multimodal.speech_llm.modules.perception_modules import AudioPerceptionModule
from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector
from nemo.collections.nlp.parts.utils_funcs import load_state_dict_helper
from nemo.utils import logging
from nemo.utils.model_utils import inject_model_parallel_rank


def get_config_and_state_dict_from_nemo(filepath, map_location, output_dir, sharded_state_dict=None):
    cwd = os.getcwd()
    save_restore_connector = NLPSaveRestoreConnector()

    with tempfile.TemporaryDirectory() as tmpdir:
        try:
            if os.path.isfile(filepath):
                save_restore_connector._unpack_nemo_file(path2file=filepath, out_folder=tmpdir)
            else:
                tmpdir = filepath

            os.chdir(tmpdir)
            config_yaml = "model_config.yaml"
            model_weights_ckpt = "model_weights.ckpt"

            # find the file in tmpdir that ends with "tokenizer.model"
            tokenizer = None
            for file in os.listdir(tmpdir):
                if file.endswith("tokenizer.model"):
                    tokenizer = file
                    break
            if tokenizer is None:
                raise ValueError(f"Tokenizer not found in {tmpdir}")
            tokenizer_path = os.path.join(tmpdir, tokenizer)
            # copy the tokenizer to the output directory
            os.system(f"cp {tokenizer_path} {output_dir}")
            tokenizer_path = os.path.join(output_dir, tokenizer)

            # load the config
            with open(config_yaml) as f:
                conf = OmegaConf.load(f)

            os.chdir(cwd)
            model_weights = os.path.join(tmpdir, model_weights_ckpt)
            model_weights = inject_model_parallel_rank(model_weights)
            state_dict = save_restore_connector._load_state_dict_from_disk(model_weights, map_location=map_location)

            # distributed checkpointing
            if state_dict is None and sharded_state_dict is not None:
                checkpoint = dict(state_dict=sharded_state_dict)
                tmp_model_weights_ckpt = os.path.join(tmpdir, save_restore_connector.model_weights_ckpt)
                tmp_model_weights_dir = os.path.splitext(tmp_model_weights_ckpt)[0]
                assert os.path.isdir(tmp_model_weights_dir), f'Expected {tmp_model_weights_dir} to be a directory.'
                checkpoint = dist_checkpointing.load(
                    sharded_state_dict=checkpoint,
                    checkpoint_dir=tmp_model_weights_dir,
                )
                state_dict = checkpoint["state_dict"]

            conf.tokenizer.model = tokenizer_path
            return conf, state_dict
        finally:
            os.chdir(cwd)


def get_llm_model_state_dict(state_dict, lora_model_state_dict):
    llm_model_state_dict = {}
    for key, value in state_dict.items():
        if key.startswith("model."):
            if key not in lora_model_state_dict and value is not None:
                llm_model_state_dict[key] = value
    return llm_model_state_dict


def get_lora_state_dict(state_dict):
    lora_model_state_dict = {}
    for key, value in state_dict.items():
        if "adapter_layer.lora" in key and value is not None:
            lora_model_state_dict[key] = value
    return lora_model_state_dict


def get_perception_state_dict(state_dict):
    perception_state_dict = {}
    for key, value in state_dict.items():
        if key.startswith("perception."):
            key = key.replace("perception.", "", 1)
            perception_state_dict[key] = value
    return perception_state_dict
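

# Taken together, these three filters partition a SALM checkpoint: keys under
# "perception." go to the perception module (with the prefix stripped), keys
# containing "adapter_layer.lora" go to the LoRA checkpoint, and the remaining
# "model." keys form the base LLM. For example, an (illustrative) key like
# "model.layers.0.adapter_layer.lora_kqv_adapter.linear_in.weight" would land
# in the LoRA set and therefore be excluded from the LLM state dict.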


def save_llm_model(state_dict, nemo_config, output_path):
    if nemo_config.get('megatron_amp_O2', False):
        # O2 checkpoints wrap the module, so rename 'model.' keys to 'model.module.'
        keys = list(state_dict.keys())
        for key in keys:
            state_dict[key.replace('model.', 'model.module.', 1)] = state_dict.pop(key)

    trainer = Trainer(accelerator='cpu', strategy=NLPDDPStrategy())
    model = load_state_dict_helper(MegatronGPTModel, nemo_config, trainer, state_dict)
    model._save_restore_connector = NLPSaveRestoreConnector()
    model.cfg.use_cpu_initialization = False

    model.save_to(output_path)
    logging.info(f'llm model saved to: {output_path}')


def save_nemo_weights(state_dict, output_dir, config, save_nemo_model=True):
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    weight_file = os.path.join(output_dir, "model_weights.ckpt")
    torch.save(state_dict, weight_file)
    # convert config to yaml
    config_file = os.path.join(output_dir, "model_config.yaml")
    with open(config_file, "w") as f:
        f.write(OmegaConf.to_yaml(config))

    if save_nemo_model:
        # a .nemo file is just a tar archive of the config and weights
        nemo_model_name = f"{output_dir}.nemo"
        nemo_path = os.path.join(output_dir, nemo_model_name)  # nemo_model_name is absolute, so join returns it as-is
        os.system(f"tar -C {output_dir} -cvf {nemo_path} model_config.yaml model_weights.ckpt")
        # remove model_config.yaml and model_weights.ckpt
        os.system(f"rm {config_file} {weight_file}")
        # remove the now-empty directory
        os.system(f"rmdir {output_dir}")


def separate_speechllm_model(model_file_path, output_dir, map_location="cuda:0"):
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    output_dir = os.path.abspath(output_dir)

    logging.info(f"Separating {model_file_path} into perception, lora, and llm model")
    filepath = model_file_path
    conf, state_dict = get_config_and_state_dict_from_nemo(filepath, map_location, output_dir)

    base_model_name = os.path.basename(filepath).split(".")[0]

    perception_state_dict = get_perception_state_dict(state_dict)
    perception_model_dir = None
    if perception_state_dict:
        perception_model_dir = f"{base_model_name}_perception"
        perception_model_dir = os.path.join(output_dir, perception_model_dir)
        save_nemo_weights(perception_state_dict, perception_model_dir, conf.perception, save_nemo_model=False)

        # verify that the exported perception model is correct
        perception = AudioPerceptionModule(cfg=conf.perception)
        perception.load_state_dict(perception_state_dict)
        perception.eval()
        print(perception)
        print(perception(input_signal=torch.randn(1, 1000), input_signal_length=torch.tensor([1000])))
        logging.info(f"Perception model saved to: {perception_model_dir}")

    lora_model_weights = get_lora_state_dict(state_dict)
    lora_model_dir = None
    if lora_model_weights:
        lora_model_dir = f"{base_model_name}_lora"
        lora_model_dir = os.path.join(output_dir, lora_model_dir)
        save_nemo_weights(lora_model_weights, lora_model_dir, conf)
        logging.info(f"Lora model saved to: {lora_model_dir}.nemo")

    # hard code the target model for now
    llm_model_weights = get_llm_model_state_dict(state_dict, lora_model_weights)
    if llm_model_weights:
        llm_model = f"{base_model_name}_llm.nemo"
        llm_model = os.path.join(output_dir, llm_model)
        conf.target = "nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel"
        save_llm_model(llm_model_weights, conf, llm_model)
        logging.info(f"LLM model saved to: {llm_model}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Separate speechllm model')
    parser.add_argument('--model_file_path', type=str, help='Path to the speechllm model')
    parser.add_argument('--output_dir', type=str, help='Output directory to save the separated models')
    args = parser.parse_args()
    separate_speechllm_model(args.model_file_path, args.output_dir)
```
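
Since a `.nemo` checkpoint is just a tar archive of `model_config.yaml` and `model_weights.ckpt` (see `save_nemo_weights` above), you can inspect what the script produced without NeMo; a minimal sketch:

```python
import tarfile

# List the members of the extracted LoRA checkpoint produced by the script.
with tarfile.open("output/speechllm_fc_llama2_7b_lora.nemo") as tar:
    print(tar.getnames())  # expect model_config.yaml and model_weights.ckpt
```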