Export FP8 TE NeMo to TRT-LLM #10096

Merged (27 commits) on Aug 29, 2024
Changes from 19 commits
Commits (27)
f966d05
initial commit
Aug 14, 2024
5087268
PR draft
Aug 14, 2024
61d0f47
fixed scaling weights
Aug 14, 2024
542d843
Apply isort and black reformatting
Aug 14, 2024
042d325
Apply isort and black reformatting
Aug 14, 2024
76535b4
fixed zarr loading, added flags, refactor
Aug 16, 2024
63e8faa
Merge branch 'main' into export_fp8_te_nemo_to_trtllm
Laplasjan107 Aug 16, 2024
7a1d042
Apply isort and black reformatting
Laplasjan107 Aug 16, 2024
7d087dd
fix expert key mapping
Aug 16, 2024
f782f6b
Merge branch 'main' into export_fp8_te_nemo_to_trtllm
Laplasjan107 Aug 19, 2024
f5ff40e
refactor
Aug 21, 2024
a11bc2f
Apply isort and black reformatting
Laplasjan107 Aug 21, 2024
7d150d7
Merge branch 'main' into export_fp8_te_nemo_to_trtllm
Laplasjan107 Aug 21, 2024
ec14cb4
fix: failed test was finishing with exit code 0
Aug 21, 2024
078c88b
Merge branch 'export_fp8_te_nemo_to_trtllm' of https://github.com/Lap…
Aug 21, 2024
157f444
Merge branch 'main' into export_fp8_te_nemo_to_trtllm
Laplasjan107 Aug 21, 2024
73d9261
test commit -- rerun github checks
Aug 21, 2024
84a5e5e
bugfix: naming
Aug 21, 2024
250525e
bugfix v2: naming
Aug 21, 2024
69b4f69
apply code review changes
Aug 23, 2024
487edd0
Apply isort and black reformatting
Laplasjan107 Aug 23, 2024
e2a3139
fix TensorRTLLM build (fp8 still not supported)
Aug 27, 2024
19c8662
Apply isort and black reformatting
Laplasjan107 Aug 27, 2024
b01fdba
undo refactor
Aug 27, 2024
a3449d2
Merge branch 'export_fp8_te_nemo_to_trtllm' of https://github.com/Lap…
Aug 28, 2024
0c922b7
bugfix: arguments to dist_convert
Aug 28, 2024
bcf85e4
Apply isort and black reformatting
Laplasjan107 Aug 28, 2024
nemo/export/tarutils.py (4 additions, 1 deletion)
@@ -20,7 +20,7 @@
import zarr.storage


class TarPath:
class TarPath(os.PathLike):
"""
A class that represents a path inside a TAR archive and behaves like pathlib.Path.

@@ -58,6 +58,9 @@ def __truediv__(self, key) -> 'TarPath':
def __str__(self) -> str:
return os.path.join(self._tar.name, self._relpath)

def __fspath__(self):
return str(self)

@property
def tarobject(self):
return self._tar
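Aside: deriving `TarPath` from `os.PathLike` and adding `__fspath__` is what lets a `TarPath` be handed directly to standard-library calls that invoke `os.fspath()` on their argument. A minimal sketch of the protocol, using a hypothetical `DemoPath` class rather than the real `TarPath`:

```python
import os


class DemoPath(os.PathLike):
    """Toy os.PathLike implementation; stands in for TarPath here."""

    def __init__(self, base: str, relpath: str):
        self._base = base
        self._relpath = relpath

    def __fspath__(self) -> str:
        # os.fspath(), os.path.join(), open(), etc. call this method.
        return os.path.join(self._base, self._relpath)


p = DemoPath("/tmp/model.nemo", "model_weights")
print(os.fspath(p))             # /tmp/model.nemo/model_weights
print(os.path.join(p, "x.pt"))  # works because DemoPath is os.PathLike
```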
nemo/export/tensorrt_llm.py (6 additions, 0 deletions)
@@ -167,6 +167,8 @@ def export(
multiple_profiles: bool = False,
gpt_attention_plugin: str = "auto",
gemm_plugin: str = "auto",
fp8_quantized: bool = False,
fp8_kvcache: bool = False,
):
"""
Exports nemo checkpoints to TensorRT-LLM.
@@ -202,6 +204,8 @@
multiple_profiles: (bool): enables multiple profiles feature of TRT-LLM. Default = False
gpt_attention_plugin (str): enable the gpt attention plugin. Default = "auto"
gemm_plugin (str): enable the gpt plugin. Default = "auto"
fp8_quantized (bool): enables exporting to FP8 TRT-LLM checkpoints
fp8_kvcache (bool): enables FP8 KV-cache quantization
"""

if n_gpus is not None:
@@ -324,6 +328,8 @@ def export(
gpus_per_node=gpus_per_node,
use_parallel_embedding=use_parallel_embedding,
use_embedding_sharing=use_embedding_sharing,
fp8_quantized=fp8_quantized,
fp8_kvcache=fp8_kvcache,
)

for weight_dict, model_config in zip(weights_dicts, model_configs):
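With these two arguments in place, FP8 export is opted into from the public `TensorRTLLM.export()` call. A rough usage sketch (paths are placeholders, and the surrounding arguments simply mirror the existing `export()` signature):

```python
from nemo.export.tensorrt_llm import TensorRTLLM

# Placeholder paths; substitute a real .nemo checkpoint and engine directory.
exporter = TensorRTLLM(model_dir="/tmp/trt_llm_engine")
exporter.export(
    nemo_checkpoint_path="/tmp/llama_fp8.nemo",
    model_type="llama",
    tensor_parallelism_size=1,
    fp8_quantized=True,   # export weights as an FP8 TRT-LLM checkpoint
    fp8_kvcache=True,     # also quantize the KV cache to FP8
)
```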
nemo/export/trt_llm/converter/model_converter.py (50 additions, 27 deletions)
@@ -15,10 +15,11 @@

import csv
import logging
from typing import Dict, List, Tuple
from typing import Dict, List, Optional, Tuple

import numpy as np
import tensorrt_llm
import torch
from tensorrt_llm._utils import pad_vocab_size
from tensorrt_llm.functional import non_gated_version
from tensorrt_llm.layers import MoeConfig
@@ -38,10 +39,11 @@
def get_config(decoder_type, config):
if decoder_type == "llama":
return LLaMAConfig(**config)
elif decoder_type == "gpt" or decoder_type == "gptnext":

if decoder_type in ["gpt", "gptnext"]:
return GPTConfig(**config)
else:
return PretrainedConfig(**config)

return PretrainedConfig(**config)


def prompt_convert(prompt_config, prompt_weights):
@@ -78,6 +80,21 @@ def prompt_convert(prompt_config, prompt_weights):
return vtokens_embeddings


def create_common_export_config(nemo_model_config, decoder_type, fp8_quantized=False, fp8_kvcache=False):
is_mcore = nemo_model_config.get("mcore_gpt", False)
return {
"apply_layernorm_1p": nemo_model_config.get("normalization", "") == "layernorm1p",
"split_gated_activation": nemo_model_config.get("activation", "gelu")
in ["swiglu", "geglu", "fast-swiglu", "fast-geglu"]
and (decoder_type == "gptnext" or is_mcore),
"num_attention_heads": nemo_model_config["num_attention_heads"],
"use_attention_nemo_shape": True,
"transpose_weights": True,
"fp8_quantized": fp8_quantized,
"fp8_kvcache": fp8_kvcache,
}


def model_to_trtllm_ckpt(
model,
nemo_model_config,
@@ -91,15 +108,17 @@ def model_to_trtllm_ckpt(
use_embedding_sharing: bool = False,
use_distributed_convert: bool = False,
model_parallel_rank: int = None,
vocab_size: int = None,
vocab_size: Optional[int] = None,
fp8_quantized: bool = False,
fp8_kvcache: bool = False,
) -> Tuple[List[Dict], List[PretrainedConfig]]:

if nemo_model_config.get("share_embeddings_and_output_weights", False) and not use_embedding_sharing:
LOGGER.info(
"Found share_embeddings_and_output_weights is True in NeMo config, set use_embedding_sharing = True"
)
use_embedding_sharing = True

export_config = create_common_export_config(nemo_model_config, decoder_type, fp8_quantized, fp8_kvcache)
# If the model has been sharded with model parallelism, convert the model in a gpu-distributed manner
if use_distributed_convert:
weights_dict = dist_model_to_trt_llm_ckpt(
@@ -108,9 +127,12 @@
inference_tp_size=tensor_parallel_size,
inference_pp_size=pipeline_parallel_size,
tokenizer_vocab_size=vocab_size,
export_config=export_config,
)
vocab_size_padded = vocab_size
else:
vocab_embedding_key = "transformer.vocab_embedding.weight"

weights_dict = convert_model_to_trt_llm_ckpt(
model=model,
nemo_model_config=nemo_model_config,
@@ -119,19 +141,23 @@
processes=1,
storage_type=dtype,
use_parallel_embedding=use_parallel_embedding,
decoder_type=decoder_type,
export_config=export_config,
)

if vocab_size is None:
vocab_size = weights_dict[vocab_embedding_key].shape[0]

has_lm_head = "lm_head.weight" in weights_dict
vocab_size_padded = pad_vocab_size(vocab_size, tensor_parallel_size) if has_lm_head else vocab_size
padding = (0, 0, 0, vocab_size_padded - vocab_size)
if has_lm_head:
lm_head_weight = weights_dict["lm_head.weight"]
if vocab_size is None:
vocab_size = weights_dict["transformer.vocab_embedding.weight"].shape[0]
vocab_size_padded = pad_vocab_size(vocab_size, tensor_parallel_size) if has_lm_head else vocab_size
lm_head_weight = torch.nn.functional.pad(lm_head_weight, padding, "constant", 0)

if has_lm_head and vocab_size_padded != vocab_size:
pad_width = vocab_size_padded - vocab_size
lm_head_weight = np.pad(lm_head_weight, ((0, pad_width), (0, 0)), "constant", constant_values=0)
if vocab_embedding_key in weights_dict:
weights_dict[vocab_embedding_key] = torch.nn.functional.pad(
weights_dict[vocab_embedding_key], padding, "constant", 0
)

world_size = tensor_parallel_size * pipeline_parallel_size
hidden_act = nemo_model_config.get('activation')
@@ -159,8 +185,8 @@
'embedding_sharding_dim': 0,
'share_embedding_table': use_embedding_sharing,
'quantization': {
'quant_algo': None,
'kv_cache_quant_algo': None,
'quant_algo': "FP8" if fp8_quantized else None,
'kv_cache_quant_algo': "FP8" if fp8_kvcache else None,
},
'bias': nemo_model_config.get('bias'),
'apply_query_key_layer_scaling': False,
@@ -203,7 +229,7 @@
return weights_dicts, model_configs

pp_key = {
"transformer.vocab_embedding.weight",
vocab_embedding_key,
"transformer.position_embedding.weight",
"lm_head.weight",
"transformer.ln_f.weight",
@@ -228,10 +254,9 @@
continue
new_key = k
if new_key.endswith(".bin"): # TP split
if new_key.endswith(f"{mapping.tp_rank}.bin"):
new_key = new_key.replace(f".{mapping.tp_rank}.bin", "")
else:
if not new_key.endswith(f"{mapping.tp_rank}.bin"):
continue
new_key = new_key.replace(f".{mapping.tp_rank}.bin", "")
if "layers" in new_key: # PP
layer_num = int(new_key.split(".")[2])
if layer_num in layers_range:
@@ -242,14 +267,12 @@

if mapping.is_first_pp_rank():
embedding_weight = (
np.ascontiguousarray(
split(weights_dict["transformer.vocab_embedding.weight"], mapping.tp_size, mapping.tp_rank)
)
np.ascontiguousarray(split(weights_dict[vocab_embedding_key], mapping.tp_size, mapping.tp_rank))
if use_parallel_embedding
else weights_dict["transformer.vocab_embedding.weight"]
else weights_dict[vocab_embedding_key]
)

weights_dict_local["transformer.vocab_embedding.weight"] = embedding_weight
weights_dict_local[vocab_embedding_key] = embedding_weight

pos_embedding_weight = weights_dict.get("transformer.position_embedding.weight")
if pos_embedding_weight is not None:
@@ -261,9 +284,9 @@

if mapping.is_last_pp_rank():
if has_lm_head:
weights_dict_local["lm_head.weight"] = np.ascontiguousarray(
split(lm_head_weight, mapping.tp_size, mapping.tp_rank)
)
weights_dict_local["lm_head.weight"] = split(
lm_head_weight, mapping.tp_size, mapping.tp_rank
).contiguous()
weights_dict_local["transformer.ln_f.weight"] = weights_dict["transformer.ln_f.weight"]

ln_f_bias = weights_dict.get("transformer.ln_f.bias")
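One detail worth spelling out from this converter change: both `lm_head.weight` and the vocab embedding are now padded with `torch.nn.functional.pad` using `padding = (0, 0, 0, vocab_size_padded - vocab_size)`, which appends zero rows (extra vocab entries) along the first dimension. A toy sketch of those semantics, with made-up sizes:

```python
import torch
import torch.nn.functional as F

vocab_size, hidden = 50257, 8  # toy sizes, not real model dims
vocab_size_padded = 50260      # e.g. padded up to a multiple of tp_size

lm_head = torch.randn(vocab_size, hidden)

# F.pad pads dimensions starting from the last one:
# (last-dim-left, last-dim-right, first-dim-left, first-dim-right)
padding = (0, 0, 0, vocab_size_padded - vocab_size)
lm_head_padded = F.pad(lm_head, padding, "constant", 0)

print(lm_head_padded.shape)             # torch.Size([50260, 8])
print(lm_head_padded[-3:].abs().sum())  # tensor(0.) -- the new rows are zeros
```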