Integrating mcore export #10238

Merged
merged 47 commits into main from integrate_mcore_export on Oct 17, 2024

Changes from 3 commits

Commits (47)
7e7eb7f
Integrating mcore export
Aug 23, 2024
d6351bb
Integrating mcore export
Aug 23, 2024
996ea05
Apply isort and black reformatting
shanmugamr1992 Aug 23, 2024
7c0584a
Move trt imports in nemo.collections.llm inside respective functions …
hemildesai Aug 23, 2024
c34d29a
Add tests for LazyNeMoIterator and fix case with metadata_only=True a…
pzelasko Aug 23, 2024
3aa1e5c
[NeMo-UX] Fix a serialization bug that prevents users from moving che…
ashors1 Aug 23, 2024
57de288
Add MemoryProfileCallback (#10166)
ShriyaPalsamudram Aug 23, 2024
9214a4e
Lower bound transformers to support nemotron (#10240)
thomasdhc Aug 23, 2024
c690c4f
[Audio] SSL Pretraining framework for flow-matching model for audio p…
Kuray107 Aug 24, 2024
04ca831
Revert torchrun fix for model import (#10251)
akoumpa Aug 26, 2024
7a8c0e8
[NeMo-UX[ Move nemotron imports inline (#10255)
marcromeyn Aug 26, 2024
ac5cb06
Wrap CPU model init with megatron_lazy_init_context (#10219)
akoumpa Aug 26, 2024
076f9ea
Bump `Dockerfile.ci` (2024-08-22) (#10227)
ko3n1g Aug 26, 2024
5964387
salm export trtllm (#10245)
Slyne Aug 26, 2024
8524596
[🤠]: Howdy folks, let's bump `Dockerfile.ci` to ef85bc9 ! (#10250)
ko3n1g Aug 27, 2024
f1f145a
[🤠]: Howdy folks, let's bump `Dockerfile.ci` to 01ca03f ! (#10266)
ko3n1g Aug 27, 2024
0d1e460
Load model in the target export precision by default in PTQ (#10267)
janekl Aug 27, 2024
f131db2
Add WandbPlugin, NsysPlugin and PreemptionPlugin to nemo.lightning.ru…
hemildesai Aug 27, 2024
86dcd99
[NeMo-UX] Handle absolute logger directories in nemo_logger (#10259)
ashors1 Aug 27, 2024
97ce34a
Add sdxl notebook (#10139)
Victor49152 Aug 27, 2024
c52a0a4
Updating some coments
Aug 27, 2024
ed26d89
Apply isort and black reformatting
shanmugamr1992 Aug 27, 2024
046a6ed
Merge branch 'main' into integrate_mcore_export
ko3n1g Aug 27, 2024
e3c5283
Updating some coments
Aug 27, 2024
1b07bd1
Apply isort and black reformatting
shanmugamr1992 Aug 27, 2024
3c1e2c1
Updating some coments
Aug 27, 2024
0c13c83
Merge branch 'main' into integrate_mcore_export
shanmugamr1992 Aug 28, 2024
25b0e95
Small change
Sep 16, 2024
f70c1da
Apply isort and black reformatting
shanmugamr1992 Sep 16, 2024
57bb895
Rebase and integrate latest mcore changes
Sep 23, 2024
822ec5b
Apply isort and black reformatting
shanmugamr1992 Sep 23, 2024
8600d31
Merge branch 'main' into integrate_mcore_export
shanmugamr1992 Sep 23, 2024
a691e55
Merge branch 'main' into integrate_mcore_export
shanmugamr1992 Sep 24, 2024
e05fe2c
ADD support for layernorm1p
Sep 25, 2024
28a0eb5
Apply isort and black reformatting
shanmugamr1992 Sep 25, 2024
6d20aed
Merge branch 'main' into integrate_mcore_export
shanmugamr1992 Sep 26, 2024
aaa4a09
Merge branch 'main' into integrate_mcore_export
shanmugamr1992 Sep 26, 2024
370945e
Merge branch 'main' into integrate_mcore_export
shanmugamr1992 Sep 27, 2024
68c635e
Update Dockerfile.ci
shanmugamr1992 Sep 27, 2024
15bc02f
Merge branch 'main' into integrate_mcore_export
shanmugamr1992 Sep 27, 2024
b7d40da
Merge branch 'main' into integrate_mcore_export
shanmugamr1992 Sep 30, 2024
7ee86bf
Update Dockerfile.ci
shanmugamr1992 Oct 1, 2024
56654b0
Merge branch 'main' into integrate_mcore_export
shanmugamr1992 Oct 1, 2024
f945350
Merge branch 'main' into integrate_mcore_export
shanmugamr1992 Oct 2, 2024
e12bf8f
Merge branch 'main' into integrate_mcore_export
shanmugamr1992 Oct 3, 2024
1d3a5ad
Update Dockerfile.ci
shanmugamr1992 Oct 3, 2024
cf7db10
Merge branch 'main' into integrate_mcore_export
Oct 8, 2024
187 changes: 145 additions & 42 deletions nemo/export/tensorrt_llm.py
@@ -204,7 +204,6 @@
gpt_attention_plugin (str): enable the gpt attention plugin. Default = "auto"
gemm_plugin (str): enable the gemm plugin. Default = "auto"
"""

if n_gpus is not None:
warnings.warn(
"Parameter n_gpus is deprecated and will be removed in the next release. "
@@ -306,51 +305,156 @@
"Supported model types are: {1}.".format(model_type, self.get_supported_models_list)
)

if model_type == "gpt" or model_type == "starcoder":
model_type = "gptnext"
model, model_configs, self.tokenizer = load_nemo_model(nemo_checkpoint_path, nemo_export_dir)
USE_NEW_CODE = True

if USE_NEW_CODE:
from megatron.core.export.data_type import DataType
from megatron.core.export.export_config import ExportConfig
from megatron.core.export.model_config import ModelConfig
from megatron.core.export.model_type import ModelType
from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import (
DEFAULT_CONVERSION_DICT,
)
from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper
from tensorrt_llm.layers import MoeConfig

if model_type == "mixtral":
model_type = "llama"
def get_model_config(nemo_model_config):
conf = ModelConfig()
conf.share_embeddings_and_output_weights = nemo_model_config.get(
"share_embeddings_and_output_weights", False
)
conf.activation = nemo_model_config.get('activation')
conf.num_moe_experts = nemo_model_config.get('num_moe_experts', 0)
conf.num_layers = nemo_model_config.get('num_layers')
conf.moe_router_topk = nemo_model_config.get('moe_router_topk', 0)
conf.num_attention_heads = nemo_model_config.get('num_attention_heads')
conf.num_query_groups = nemo_model_config.get(
'num_query_groups', nemo_model_config['num_attention_heads']
)
conf.kv_channels = nemo_model_config.get("kv_channels", None)
conf.hidden_size = nemo_model_config.get('hidden_size')
conf.ffn_hidden_size = nemo_model_config.get('ffn_hidden_size')
conf.layernorm_epsilon = nemo_model_config.get('layernorm_epsilon')
conf.position_embedding_type = nemo_model_config.get('position_embedding_type')
conf.max_position_embeddings = nemo_model_config.get('max_position_embeddings')
conf.bias = nemo_model_config.get('bias')
conf.rotary_percentage = nemo_model_config.get('rotary_percentage', 1.0)
conf.rotary_base = nemo_model_config.get('rotary_base', 10000)
conf.num_moe_experts = nemo_model_config.get('num_moe_experts', 0)
conf.moe_renorm_model = nemo_model_config.get(
'moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE
)
conf.moe_tp_mode = nemo_model_config.get('moe_tp_mode', 2)
conf.seq_len_interpolation_factor = nemo_model_config.get("seq_len_interpolation_factor")
conf.mcore_gpt = nemo_model_config.get("mcore_gpt", False)
conf.share_embeddings_and_output_weights = nemo_model_config.get(
"share_embeddings_and_output_weights", False
)
conf.apply_embedding_scaling = nemo_model_config.get("apply_embedding_scaling", False)
conf.multi_query_mode = nemo_model_config.get("multi_query_mode", False)
conf.normalization = nemo_model_config.get("normalization", "")
conf.precision = nemo_model_config.get("precision")
return conf

input_model_config = get_model_config(model_configs)
input_model_type = getattr(ModelType, model_type)
mcore_model_conversion_dict = DEFAULT_CONVERSION_DICT[input_model_type]
nemo_model_conversion_dict = {
f'model.{key}': value for key, value in mcore_model_conversion_dict.items()
}
trtllm_helper = TRTLLMHelper(
input_model_config, input_model_type, trtllm_conversion_dict=nemo_model_conversion_dict
)

model, model_configs, self.tokenizer = load_nemo_model(nemo_checkpoint_path, nemo_export_dir)
weights_dicts, model_configs = model_to_trtllm_ckpt(
model=model,
nemo_model_config=model_configs,
nemo_export_dir=nemo_export_dir,
decoder_type=model_type,
dtype=dtype,
tensor_parallel_size=tensor_parallelism_size,
pipeline_parallel_size=pipeline_parallelism_size,
gpus_per_node=gpus_per_node,
use_parallel_embedding=use_parallel_embedding,
use_embedding_sharing=use_embedding_sharing,
)
input_dtype = getattr(DataType, dtype)
export_config = ExportConfig(
tensor_parallelism_size,
pipeline_parallelism_size,
use_parallel_embedding,
use_embedding_sharing,
gpus_per_node,
)

for weight_dict, model_config in zip(weights_dicts, model_configs):
build_and_save_engine(
max_input_len=max_input_len,
max_output_len=max_output_len,
max_batch_size=max_batch_size,
model_config=model_config,
model_weights=weight_dict,
model_dir=self.model_dir,
model_type=model_type,
lora_ckpt_list=self.lora_ckpt_list,
use_lora_plugin=use_lora_plugin,
max_lora_rank=max_lora_rank,
lora_target_modules=lora_target_modules,
max_prompt_embedding_table_size=max_prompt_embedding_table_size,
paged_kv_cache=paged_kv_cache,
remove_input_padding=remove_input_padding,
paged_context_fmha=paged_context_fmha,
max_num_tokens=max_num_tokens,
opt_num_tokens=opt_num_tokens,
max_seq_len=max_seq_len,
multiple_profiles=multiple_profiles,
gpt_attention_plugin=gpt_attention_plugin,
gemm_plugin=gemm_plugin,
trtllm_model_weights_list, trtllm_model_config_list = (
trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
model_state_dict=model, export_config=export_config, dtype=input_dtype
)
)

for trtllm_model_weights, trtllm_model_config in zip(
trtllm_model_weights_list, trtllm_model_config_list
):
trtllm_helper.build_and_save_engine(
max_input_len=max_input_len,
max_output_len=max_output_len,
max_batch_size=max_batch_size,
engine_dir=self.model_dir,
trtllm_model_weights=trtllm_model_weights,
trtllm_model_config=trtllm_model_config,
lora_ckpt_list=self.lora_ckpt_list,
use_lora_plugin=use_lora_plugin,
max_lora_rank=max_lora_rank,
lora_target_modules=lora_target_modules,
max_prompt_embedding_table_size=max_prompt_embedding_table_size,
enable_multi_block_mode=False,
paged_kv_cache=paged_kv_cache,
remove_input_padding=remove_input_padding,
paged_context_fmha=paged_context_fmha,
use_custom_all_reduce=True,
use_refit=False,
max_num_tokens=max_num_tokens,
max_seq_len=max_seq_len,
opt_num_tokens=opt_num_tokens,
max_beam_width=1,
tokens_per_block=128,
multiple_profiles=multiple_profiles,
gpt_attention_plugin=gpt_attention_plugin,
gemm_plugin=gemm_plugin,
)
else:
if model_type == "gpt" or model_type == "starcoder":
model_type = "gptnext"

if model_type == "mixtral":
model_type = "llama"
weights_dicts, model_configs = model_to_trtllm_ckpt(
model=model,
nemo_model_config=model_configs,
nemo_export_dir=nemo_export_dir,
decoder_type=model_type,
dtype=dtype,
tensor_parallel_size=tensor_parallelism_size,
pipeline_parallel_size=pipeline_parallelism_size,
gpus_per_node=gpus_per_node,
use_parallel_embedding=use_parallel_embedding,
use_embedding_sharing=use_embedding_sharing,
)
for weight_dict, model_config in zip(weights_dicts, model_configs):
build_and_save_engine(
max_input_len=max_input_len,
max_output_len=max_output_len,
max_batch_size=max_batch_size,
model_config=model_config,
model_weights=weight_dict,
model_dir=self.model_dir,
model_type=model_type,
lora_ckpt_list=self.lora_ckpt_list,
use_lora_plugin=use_lora_plugin,
max_lora_rank=max_lora_rank,
lora_target_modules=lora_target_modules,
max_prompt_embedding_table_size=max_prompt_embedding_table_size,
paged_kv_cache=paged_kv_cache,
remove_input_padding=remove_input_padding,
paged_context_fmha=paged_context_fmha,
max_num_tokens=max_num_tokens,
opt_num_tokens=opt_num_tokens,
max_seq_len=max_seq_len,
multiple_profiles=multiple_profiles,
gpt_attention_plugin=gpt_attention_plugin,
gemm_plugin=gemm_plugin,
)

tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model")
if os.path.exists(tokenizer_path):
shutil.copy(tokenizer_path, self.model_dir)
@@ -429,7 +533,6 @@
weight_dict[k] = numpy_to_torch(v)

safetensors.torch.save_file(weight_dict, os.path.join(self.model_dir, f'rank{rank}.safetensors'))

model_configs[0].to_json_file(os.path.join(self.model_dir, 'config.json'))

tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model")
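For orientation, here is a minimal caller-side sketch of how the export path touched by this diff is typically driven. It assumes the TensorRTLLM wrapper's constructor and export() accept the attributes and parameters visible in the hunks above (model_dir, nemo_checkpoint_path, model_type, the parallelism sizes, and the engine-build limits); the checkpoint path, output directory, and sizes below are placeholders. With USE_NEW_CODE enabled, the same call is routed through megatron.core's TRTLLMHelper instead of the legacy model_to_trtllm_ckpt / build_and_save_engine pair.

from nemo.export.tensorrt_llm import TensorRTLLM

# Placeholder paths and sizes: export() loads the checkpoint with load_nemo_model(),
# maps the NeMo config onto megatron.core.export's ModelConfig via get_model_config(),
# and lets TRTLLMHelper convert the weights and build/save TRT-LLM engines under model_dir.
exporter = TensorRTLLM(model_dir="/tmp/trtllm_engines")
exporter.export(
    nemo_checkpoint_path="/path/to/model.nemo",
    model_type="llama",
    tensor_parallelism_size=1,
    pipeline_parallelism_size=1,
    max_input_len=1024,
    max_output_len=256,
    max_batch_size=8,
)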
3 changes: 1 addition & 2 deletions nemo/export/trt_llm/converter/model_converter.py
@@ -38,8 +38,6 @@ def get_config(decoder_type, config):
"llama": tensorrt_llm.models.llama.config.LLaMAConfig,
"gpt": tensorrt_llm.models.gpt.config.GPTConfig,
"gptnext": tensorrt_llm.models.gpt.config.GPTConfig,
"falcon": tensorrt_llm.models.falcon.config.FalconConfig,
"gemma": tensorrt_llm.models.GemmaConfig,
}
config_cls = DECODER_CONFIG[decoder_type] if decoder_type in DECODER_CONFIG else PretrainedConfig

@@ -181,6 +179,7 @@ def model_to_trtllm_ckpt(
'tp_size': tensor_parallel_size,
'pp_size': pipeline_parallel_size,
}

model_configs = []
weights_dicts = []
num_layers = nemo_model_config.get('num_layers')
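In model_converter.py, this change drops the falcon and gemma entries from DECODER_CONFIG, so get_config() now resolves those decoder types through the generic PretrainedConfig fallback. Below is a small self-contained sketch of that selection logic; the stand-in classes and the helper name pick_config_cls are illustrative, not the real tensorrt_llm config classes.

# Stand-ins for tensorrt_llm's config classes; only the fallback behaviour is shown.
class PretrainedConfig: ...
class LLaMAConfig(PretrainedConfig): ...
class GPTConfig(PretrainedConfig): ...

DECODER_CONFIG = {
    "llama": LLaMAConfig,
    "gpt": GPTConfig,
    "gptnext": GPTConfig,
}

def pick_config_cls(decoder_type):
    # Known decoder types map to a specialised config class; anything else
    # (now including "falcon" and "gemma") falls back to PretrainedConfig.
    return DECODER_CONFIG[decoder_type] if decoder_type in DECODER_CONFIG else PretrainedConfig

assert pick_config_cls("gptnext") is GPTConfig
assert pick_config_cls("falcon") is PretrainedConfig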