salm export trtllm (#10245)
Signed-off-by: slyne deng <slyned@nvidia.com>
Co-authored-by: slyne deng <slyned@nvidia.com>
2 people authored and hemildesai committed Aug 28, 2024
1 parent ecb1813 commit 2972e37
Showing 9 changed files with 810 additions and 28 deletions.
83 changes: 83 additions & 0 deletions examples/multimodal/speech_llm/export/README.md
@@ -0,0 +1,83 @@
## Setup
In this section, we export the SALM model to TensorRT-LLM.
First, download the [SALM NeMo model](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/speechllm_fc_llama2_7b/) from NVIDIA NGC:

```bash
wget --content-disposition 'https://api.ngc.nvidia.com/v2/models/org/nvidia/team/nemo/speechllm_fc_llama2_7b/1.23.1/files?redirect=true&path=speechllm_fc_llama2_7b.nemo' -O speechllm_fc_llama2_7b.nemo
```

Next, extract the individual components of SALM (the perception module, the LoRA weights, and the base LLM):
```bash
output=$PWD/output
python3 extract_salm_weights.py --model_file_path=speechllm_fc_llama2_7b.nemo --output_dir=$output
```
The above command takes a while to run.

Under the `output` directory, you'll see:
```
output
|___speechllm_fc_llama2_7b_lora.nemo
|___speechllm_fc_llama2_7b_perception
| |____model_config.yaml
| |____model_weights.ckpt
|___speechllm_fc_llama2_7b_llm.nemo
|___ xxx.tokenizer.model
```

Once we have the LoRA NeMo model and the base LLM NeMo model, we can merge the LoRA weights into the LLM:
```bash
python /opt/NeMo/scripts/nlp_language_modeling/merge_lora_weights/merge.py \
trainer.accelerator=gpu \
tensor_model_parallel_size=1 \
pipeline_model_parallel_size=1 \
gpt_model_file=output/speechllm_fc_llama2_7b_llm.nemo \
lora_model_path=output/speechllm_fc_llama2_7b_lora.nemo \
merged_model_path=speechllm_fc_llama2_7b_llm_merged.nemo
```
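
Merging folds the low-rank adapter updates back into the frozen base weights, producing a plain LLM checkpoint with no adapter layers. Below is a minimal PyTorch sketch of the underlying math (W' = W + (α/r)·B·A) with made-up shapes; it is illustrative only, not the NeMo merge script's actual implementation:

```python
import torch

# Illustrative LoRA merge: W' = W + (alpha / r) * B @ A.
# Shapes and scaling here are hypothetical; the real merge script also
# handles checkpoint key mapping and model parallelism.
d_out, d_in, r, alpha = 4096, 4096, 8, 16

W = torch.randn(d_out, d_in)      # frozen base weight
A = torch.randn(r, d_in) * 0.01   # LoRA down-projection
B = torch.randn(d_out, r) * 0.01  # LoRA up-projection

W_merged = W + (alpha / r) * (B @ A)  # adapter folded into the base weight
```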

Now we can export the TensorRT-LLM engines:
```bash
python3 export_salm.py \
model.perception_model_path=output/speechllm_fc_llama2_7b_perception \
model.llm_model_path=output/speechllm_fc_llama2_7b_llm_merged.nemo
```
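
The `key=value` arguments are Hydra-style overrides of the fields in `conf/salm_export.yaml` (added in this commit); any other field, such as `infer.output_dir` or `infer.max_batch_size`, can be overridden the same way.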

The generated engines will be placed under the `./salm` folder. To run them from Python:
```python
from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter

output_dir = "/ws/salm" # the engine directory
trt_llm_exporter = TensorRTMMExporter(model_dir=output_dir, load_model=True, modality='audio')
input_text = "Q: what's the transcription of the audio? A:"
input_media = '/ws/data/test_audio.wav'
print(trt_llm_exporter.forward(input_text, input_media))

```

## Deploy
Alternatively, to generate the engines and deploy them with the Triton Inference Server in one step, run:

```bash
python3 NeMo/scripts/deploy/multimodal/deploy_triton.py \
--modality="audio" \
--visual_checkpoint=NeMo/examples/multimodal/speech_llm/export/output/speechllm_fc_llama2_7b_perception \
--llm_checkpoint=NeMo/examples/multimodal/speech_llm/export/output/speechllm_fc_llama2_7b_llm_merged.nemo \
--llm_model_type="llama" \
--model_type="salm" \
--triton_model_name="salm" \
--max_input_len=4096 \
--max_output_len=256 \
--max_multimodal_len=3072 \
--triton_model_repository=/tmp/trt_model_dir/
```

On the client side, run:
```bash
python3 NeMo/scripts/deploy/multimodal/query.py \
--model_name="salm" \
--model_type="salm" \
--input_text="Q: what's the transcription of the audio? A:" \
--input_media=/ws/data/test_audio.wav
```
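
`query.py` is a thin wrapper around the multimodal query client updated in this commit (`nemo/deploy/multimodal/query_multimodal.py`). Here is a minimal sketch of calling that client directly; the class name, constructor arguments, and `query()` signature are assumptions, so treat it as illustrative:

```python
from nemo.deploy.multimodal import NemoQueryMultimodal

# Hypothetical direct use of the client behind query.py; assumes Triton
# is serving the "salm" model on the default HTTP port.
nq = NemoQueryMultimodal(url="localhost:8000", model_name="salm", model_type="salm")
output = nq.query(
    input_text="Q: what's the transcription of the audio? A:",
    input_media="/ws/data/test_audio.wav",
    max_output_len=256,
)
print(output)
```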

For more details, see `NeMo/scripts/deploy/multimodal/deploy_triton.py` and `NeMo/scripts/deploy/multimodal/query.py`.
16 changes: 16 additions & 0 deletions examples/multimodal/speech_llm/export/conf/salm_export.yaml
@@ -0,0 +1,16 @@
name: speechllm_salm
infer:
output_dir: ./salm
max_batch_size: 1
tensor_parallelism: 1
max_input_len: 4096
max_output_len: 256
max_multimodal_len: 3072
perception_max_batch_size: 1

model:
type: salm
precision: float16
perception_model_path: /path/to/speechllm_llama2_7b_perception
llm_model_path: /path/to/speechllm_llama2_7b_llm.nemo
llm_model_type: llama
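
`export_salm.py` (below) loads this file through NeMo's `hydra_runner`, so every field can be overridden from the command line with `key=value` dot-list syntax. A small sketch of the same mechanism using plain OmegaConf, assuming the file sits at `conf/salm_export.yaml`:

```python
from omegaconf import OmegaConf

# Load the export config and apply a dot-list override, mirroring
# `python3 export_salm.py model.llm_model_path=...` on the CLI.
cfg = OmegaConf.load("conf/salm_export.yaml")
overrides = OmegaConf.from_dotlist(
    ["model.llm_model_path=speechllm_fc_llama2_7b_llm_merged.nemo"]
)
cfg = OmegaConf.merge(cfg, overrides)
print(cfg.model.llm_model_path)  # speechllm_fc_llama2_7b_llm_merged.nemo
```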
39 changes: 39 additions & 0 deletions examples/multimodal/speech_llm/export/export_salm.py
@@ -0,0 +1,39 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from nemo.core.config import hydra_runner
from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter


@hydra_runner(config_path='conf', config_name='salm_export')
def main(cfg):
exporter = TensorRTMMExporter(model_dir=cfg.infer.output_dir, load_model=False, modality='audio')
exporter.export(
visual_checkpoint_path=cfg.model.perception_model_path,
llm_checkpoint_path=cfg.model.llm_model_path,
model_type=cfg.model.type,
llm_model_type=cfg.model.llm_model_type,
tensor_parallel_size=cfg.infer.tensor_parallelism,
max_input_len=cfg.infer.max_input_len,
max_output_len=cfg.infer.max_output_len,
vision_max_batch_size=cfg.infer.perception_max_batch_size,
max_batch_size=cfg.infer.max_batch_size,
max_multimodal_len=cfg.infer.max_multimodal_len,
dtype=cfg.model.precision,
load_model=False,
)


if __name__ == '__main__':
main()
204 changes: 204 additions & 0 deletions examples/multimodal/speech_llm/export/extract_salm_weights.py
@@ -0,0 +1,204 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import argparse
import os
import tempfile

import torch
from megatron.core import dist_checkpointing
from omegaconf import OmegaConf
from pytorch_lightning.trainer.trainer import Trainer

from nemo.collections.multimodal.speech_llm.modules.perception_modules import AudioPerceptionModule
from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector
from nemo.collections.nlp.parts.utils_funcs import load_state_dict_helper
from nemo.utils import logging
from nemo.utils.model_utils import inject_model_parallel_rank


def get_config_and_state_dict_from_nemo(filepath, map_location, output_dir, sharded_state_dict=None):
cwd = os.getcwd()
save_restore_connector = NLPSaveRestoreConnector()

with tempfile.TemporaryDirectory() as tmpdir:
try:
if os.path.isfile(filepath):
save_restore_connector._unpack_nemo_file(path2file=filepath, out_folder=tmpdir)
else:
tmpdir = filepath

os.chdir(tmpdir)
config_yaml = "model_config.yaml"
model_weights_ckpt = "model_weights.ckpt"

# find file in tmpdir that endswith "tokenizer.model"
tokenizer = None
for file in os.listdir(tmpdir):
if file.endswith("tokenizer.model"):
tokenizer = file
break
if tokenizer is None:
raise ValueError(f"Tokenizer not found in {tmpdir}")
tokenizer_path = os.path.join(tmpdir, tokenizer)
# copy tokenizer_path to current directory
os.system(f"cp {tokenizer_path} {output_dir}")
tokenizer_path = os.path.join(output_dir, tokenizer)

# load conf
with open(config_yaml) as f:
conf = OmegaConf.load(f)

os.chdir(cwd)
model_weights = os.path.join(tmpdir, model_weights_ckpt)
model_weights = inject_model_parallel_rank(model_weights)
state_dict = save_restore_connector._load_state_dict_from_disk(model_weights, map_location=map_location)

# distributed checkpointing
if state_dict is None and sharded_state_dict is not None:
checkpoint = dict(state_dict=sharded_state_dict)
tmp_model_weights_ckpt = os.path.join(tmpdir, save_restore_connector.model_weights_ckpt)
tmp_model_weights_dir = os.path.splitext(tmp_model_weights_ckpt)[0]
assert os.path.isdir(tmp_model_weights_dir), f'Expected {tmp_model_weights_dir} to be a directory.'
checkpoint = dist_checkpointing.load(
sharded_state_dict=checkpoint,
checkpoint_dir=tmp_model_weights_dir,
)
state_dict = checkpoint["state_dict"]

conf.tokenizer.model = tokenizer_path
return conf, state_dict
finally:
os.chdir(cwd)


def get_llm_model_state_dict(state_dict, lora_model_state_dict):
llm_model_state_dict = {}
for key, value in state_dict.items():
if key.startswith("model."):
            if key not in lora_model_state_dict and value is not None:
llm_model_state_dict[key] = value
return llm_model_state_dict


def get_lora_state_dict(state_dict):
lora_model_state_dict = {}
for key, value in state_dict.items():
if "adapter_layer.lora" in key and value != None:
lora_model_state_dict[key] = value
return lora_model_state_dict


def get_perception_state_dict(state_dict):
perception_state_dict = {}
for key, value in state_dict.items():
if key.startswith("perception."):
key = key.replace("perception.", "", 1)
perception_state_dict[key] = value
return perception_state_dict


def save_llm_model(state_dict, nemo_config, output_path):
if nemo_config.get('megatron_amp_O2', False):
keys = list(state_dict.keys())
for key in keys:
            state_dict[key.replace('model.', 'model.module.', 1)] = state_dict.pop(key)

trainer = Trainer(accelerator='cpu', strategy=NLPDDPStrategy())
model = load_state_dict_helper(MegatronGPTModel, nemo_config, trainer, state_dict)
model._save_restore_connector = NLPSaveRestoreConnector()
model.cfg.use_cpu_initialization = False

model.save_to(output_path)
logging.info(f'llm model saved to: {output_path}')


def save_nemo_weights(state_dict, output_dir, config, save_nemo_model=True):
if not os.path.exists(output_dir):
os.mkdir(output_dir)
weight_file = os.path.join(output_dir, "model_weights.ckpt")
torch.save(state_dict, weight_file)
# convert config to yaml
config_file = os.path.join(output_dir, "model_config.yaml")
with open(config_file, "w") as f:
f.write(OmegaConf.to_yaml(config))

if save_nemo_model:
# create nemo file
nemo_model_name = f"{output_dir}.nemo"
nemo_path = os.path.join(output_dir, nemo_model_name)
# tar model_config.yaml and model_weights.ckpt
os.system(f"tar -C {output_dir} -cvf {nemo_path} model_config.yaml model_weights.ckpt")
# remove model_config.yaml and model_weights.ckpt
os.system(f"rm {config_file} {weight_file}")
# remove the empty directory
os.system(f"rmdir {output_dir}")


def separate_speechllm_model(model_file_path, output_dir, map_location="cuda:0"):
if not os.path.exists(output_dir):
os.mkdir(output_dir)
output_dir = os.path.abspath(output_dir)

logging.info(f"Separating {model_file_path} into perception, lora, and llm model")
filepath = model_file_path
conf, state_dict = get_config_and_state_dict_from_nemo(filepath, map_location, output_dir)

base_model_name = os.path.basename(filepath).split(".")[0]

perception_state_dict = get_perception_state_dict(state_dict)
perception_model_dir = None
if perception_state_dict:
perception_model_dir = f"{base_model_name}_perception"
perception_model_dir = os.path.join(output_dir, perception_model_dir)
save_nemo_weights(perception_state_dict, perception_model_dir, conf.perception, save_nemo_model=False)

# verify if the exported perception model is correct
perception = AudioPerceptionModule(cfg=conf.perception)
perception.load_state_dict(perception_state_dict)
perception.eval()
print(perception)
print(perception(input_signal=torch.randn(1, 1000), input_signal_length=torch.tensor([1000])))
# absolute path of perception model
logging.info(f"Perception model saved to: {perception_model_dir}")

lora_model_weights = get_lora_state_dict(state_dict)
lora_model_dir = None
if lora_model_weights:
lora_model_dir = f"{base_model_name}_lora"
lora_model_dir = os.path.join(output_dir, lora_model_dir)
save_nemo_weights(lora_model_weights, lora_model_dir, conf)
logging.info(f"Lora model saved to: {lora_model_dir}.nemo")
# hard code the target model for now
llm_model_weights = get_llm_model_state_dict(state_dict, lora_model_weights)
if llm_model_weights:
llm_model = f"{base_model_name}_llm.nemo"
llm_model = os.path.join(output_dir, llm_model)
conf.target = "nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel"
save_llm_model(llm_model_weights, conf, llm_model)
logging.info(f"LLM model saved to: {llm_model}")


if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Separate speechllm model')
parser.add_argument('--model_file_path', type=str, help='Path to the speechllm model')
parser.add_argument('--output_dir', type=str, help='Output directory to save the separated models')
args = parser.parse_args()
separate_speechllm_model(args.model_file_path, args.output_dir)
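
As a quick illustration of the key routing performed by the three `get_*_state_dict` helpers above, here is a self-contained toy example; the checkpoint keys are made up, but they follow the prefixes the helpers filter on:

```python
import torch

# Hypothetical checkpoint keys exercising the three filters above.
state_dict = {
    "perception.encoder.weight": torch.zeros(1),
    "model.layers.0.adapter_layer.lora_kqv_adapter.linear_in.weight": torch.zeros(1),
    "model.layers.0.self_attention.weight": torch.zeros(1),
}

lora = {k: v for k, v in state_dict.items() if "adapter_layer.lora" in k}
perception = {k.replace("perception.", "", 1): v
              for k, v in state_dict.items() if k.startswith("perception.")}
llm = {k: v for k, v in state_dict.items()
       if k.startswith("model.") and k not in lora}

assert set(perception) == {"encoder.weight"}
assert set(lora) == {"model.layers.0.adapter_layer.lora_kqv_adapter.linear_in.weight"}
assert set(llm) == {"model.layers.0.self_attention.weight"}
```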
12 changes: 10 additions & 2 deletions nemo/deploy/multimodal/query_multimodal.py
@@ -13,6 +13,7 @@
# limitations under the License.

import numpy as np
import soundfile as sf
from PIL import Image

from nemo.deploy.utils import str_list2numpy
@@ -71,6 +72,11 @@ def setup_media(self, input_media):
elif self.model_type == "neva" or self.model_type == "vila":
media = Image.open(input_media).convert('RGB')
return np.expand_dims(np.array(media), axis=0)
elif self.model_type == "salm":
waveform, sample_rate = sf.read(input_media, dtype=np.float32)
input_signal = np.array([waveform], dtype=np.float32)
input_signal_length = np.array([[len(waveform)]], dtype=np.int32)
return {"input_signal": input_signal, "input_signal_length": input_signal_length}
else:
raise RuntimeError(f"Invalid model type {self.model_type}")

@@ -105,8 +111,10 @@ def query(
inputs = {"input_text": prompts}

media = self.setup_media(input_media)

inputs["input_media"] = np.repeat(media[np.newaxis, :, :, :, :], prompts.shape[0], axis=0)
if isinstance(media, dict):
inputs.update(media)
else:
inputs["input_media"] = np.repeat(media[np.newaxis, :, :, :, :], prompts.shape[0], axis=0)

if batch_size is not None:
inputs["batch_size"] = np.full(prompts.shape, batch_size, dtype=np.int_)