From 7e7eb7f6315f3af92bfb62607d0cc91f91ce0d68 Mon Sep 17 00:00:00 2001 From: Shanmugam Ramasamy Date: Fri, 23 Aug 2024 00:13:21 -0700 Subject: [PATCH 01/31] Integrating mcore export --- nemo/export/tensorrt_llm.py | 169 +++++++++++++----- .../trt_llm/converter/model_converter.py | 3 +- 2 files changed, 125 insertions(+), 47 deletions(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 2a89b76cc099..648a14f1b7b5 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -204,7 +204,6 @@ def export( gpt_attention_plugin (str): enable the gpt attention plugin. Default = "auto" gemm_plugin (str): enable the gpt plugin. Default = "auto" """ - if n_gpus is not None: warnings.warn( "Parameter n_gpus is deprecated and will be removed in the next release. " @@ -306,50 +305,131 @@ def export( "Supported model types are: {1}.".format(model_type, self.get_supported_models_list) ) - if model_type == "gpt" or model_type == "starcoder": - model_type = "gptnext" - - if model_type == "mixtral": - model_type = "llama" - model, model_configs, self.tokenizer = load_nemo_model(nemo_checkpoint_path, nemo_export_dir) - weights_dicts, model_configs = model_to_trtllm_ckpt( - model=model, - nemo_model_config=model_configs, - nemo_export_dir=nemo_export_dir, - decoder_type=model_type, - dtype=dtype, - tensor_parallel_size=tensor_parallelism_size, - pipeline_parallel_size=pipeline_parallelism_size, - gpus_per_node=gpus_per_node, - use_parallel_embedding=use_parallel_embedding, - use_embedding_sharing=use_embedding_sharing, - ) - - for weight_dict, model_config in zip(weights_dicts, model_configs): - build_and_save_engine( - max_input_len=max_input_len, - max_output_len=max_output_len, - max_batch_size=max_batch_size, - model_config=model_config, - model_weights=weight_dict, - model_dir=self.model_dir, - model_type=model_type, - lora_ckpt_list=self.lora_ckpt_list, - use_lora_plugin=use_lora_plugin, - max_lora_rank=max_lora_rank, - lora_target_modules=lora_target_modules, - max_prompt_embedding_table_size=max_prompt_embedding_table_size, - paged_kv_cache=paged_kv_cache, - remove_input_padding=remove_input_padding, - paged_context_fmha=paged_context_fmha, - max_num_tokens=max_num_tokens, - opt_num_tokens=opt_num_tokens, - max_seq_len=max_seq_len, - multiple_profiles=multiple_profiles, - gpt_attention_plugin=gpt_attention_plugin, - gemm_plugin=gemm_plugin, + USE_NEW_CODE = True + + if USE_NEW_CODE: + from megatron.core.export.model_type import ModelType + from megatron.core.export.model_config import ModelConfig + from megatron.core.export.data_type import DataType + from megatron.core.export.export_config import ExportConfig + from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper + from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import DEFAULT_CONVERSION_DICT + from tensorrt_llm.layers import MoeConfig + + def get_model_config(nemo_model_config): + conf = ModelConfig() + conf.share_embeddings_and_output_weights = nemo_model_config.get("share_embeddings_and_output_weights", False) + conf.activation = nemo_model_config.get('activation') + conf.nemo_model_config = nemo_model_config.get('num_moe_experts', 0) + conf.num_layers = nemo_model_config.get('num_layers') + conf.moe_router_topk = nemo_model_config.get('moe_router_topk', 0) + conf.num_attention_heads = nemo_model_config.get('num_attention_heads') + conf.num_query_groups = nemo_model_config.get('num_query_groups', nemo_model_config['num_attention_heads']) + conf.kv_channels = 
nemo_model_config.get("kv_channels", None) + conf.hidden_size = nemo_model_config.get('hidden_size') + conf.ffn_hidden_size = nemo_model_config.get('ffn_hidden_size') + conf.layernorm_epsilon = nemo_model_config.get('layernorm_epsilon') + conf.position_embedding_type = nemo_model_config.get('position_embedding_type') + conf.max_position_embeddings = nemo_model_config.get('max_position_embeddings') + conf.bias = nemo_model_config.get('bias') + conf.rotary_percentage = nemo_model_config.get('rotary_percentage', 1.0) + conf.rotary_base = nemo_model_config.get('rotary_base', 10000) + conf.num_moe_experts = nemo_model_config.get('num_moe_experts', 0) + conf.moe_renorm_model = nemo_model_config.get( + 'moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE + ) + conf.moe_tp_mode = nemo_model_config.get('moe_tp_mode', 2) + conf.seq_len_interpolation_factor = nemo_model_config.get("seq_len_interpolation_factor") + conf.mcore_gpt = nemo_model_config.get("mcore_gpt", False) + conf.share_embeddings_and_output_weights = nemo_model_config.get("share_embeddings_and_output_weights", False) + conf.apply_embedding_scaling = nemo_model_config.get("apply_embedding_scaling", False) + conf.multi_query_mode = nemo_model_config.get("multi_query_mode", False) + conf.normalization = nemo_model_config.get("normalization", "") + conf.precision = nemo_model_config.get("precision") + return conf + + input_model_config = get_model_config(model_configs) + input_model_type = getattr(ModelType, model_type) + mcore_model_conversion_dict = DEFAULT_CONVERSION_DICT[input_model_type] + nemo_model_conversion_dict = {f'model.{key}':value for key, value in mcore_model_conversion_dict.items()} + trtllm_helper = TRTLLMHelper(input_model_config, input_model_type, trtllm_conversion_dict = nemo_model_conversion_dict) + + input_dtype = getattr(DataType, dtype) + export_config = ExportConfig(tensor_parallelism_size, pipeline_parallelism_size, use_parallel_embedding, use_embedding_sharing, gpus_per_node) + + trtllm_model_weights_list, trtllm_model_config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(model_state_dict = model, export_config = export_config, dtype = input_dtype) + + for trtllm_model_weights, trtllm_model_config in zip(trtllm_model_weights_list, trtllm_model_config_list): + trtllm_helper.build_and_save_engine( + max_input_len=max_input_len, + max_output_len=max_output_len, + max_batch_size=max_batch_size, + engine_dir=self.model_dir, + trtllm_model_weights=trtllm_model_weights, + trtllm_model_config=trtllm_model_config, + lora_ckpt_list=self.lora_ckpt_list, + use_lora_plugin=use_lora_plugin, + max_lora_rank=max_lora_rank, + lora_target_modules=lora_target_modules, + max_prompt_embedding_table_size=max_prompt_embedding_table_size, + enable_multi_block_mode=False, + paged_kv_cache=paged_kv_cache, + remove_input_padding=remove_input_padding, + paged_context_fmha=paged_context_fmha, + use_custom_all_reduce=True, + use_refit=False, + max_num_tokens=max_num_tokens, + max_seq_len=max_seq_len, + opt_num_tokens=opt_num_tokens, + max_beam_width=1, + tokens_per_block=128, + multiple_profiles=multiple_profiles, + gpt_attention_plugin=gpt_attention_plugin, + gemm_plugin=gemm_plugin, + ) + else : + if model_type == "gpt" or model_type == "starcoder": + model_type = "gptnext" + + if model_type == "mixtral": + model_type = "llama" + weights_dicts, model_configs = model_to_trtllm_ckpt( + model=model, + nemo_model_config=model_configs, + nemo_export_dir='/tmp/shan', + decoder_type=model_type, + dtype=dtype, 
+ tensor_parallel_size=tensor_parallelism_size, + pipeline_parallel_size=pipeline_parallelism_size, + gpus_per_node=gpus_per_node, + use_parallel_embedding=use_parallel_embedding, + use_embedding_sharing=use_embedding_sharing, ) + for weight_dict, model_config in zip(weights_dicts, model_configs): + build_and_save_engine( + max_input_len=max_input_len, + max_output_len=max_output_len, + max_batch_size=max_batch_size, + model_config=model_config, + model_weights=weight_dict, + model_dir=self.model_dir, + model_type=model_type, + lora_ckpt_list=self.lora_ckpt_list, + use_lora_plugin=use_lora_plugin, + max_lora_rank=max_lora_rank, + lora_target_modules=lora_target_modules, + max_prompt_embedding_table_size=max_prompt_embedding_table_size, + paged_kv_cache=paged_kv_cache, + remove_input_padding=remove_input_padding, + paged_context_fmha=paged_context_fmha, + max_num_tokens=max_num_tokens, + opt_num_tokens=opt_num_tokens, + max_seq_len=max_seq_len, + multiple_profiles=multiple_profiles, + gpt_attention_plugin=gpt_attention_plugin, + gemm_plugin=gemm_plugin, + ) tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") if os.path.exists(tokenizer_path): @@ -429,7 +509,6 @@ def convert_to_safe_tensors( weight_dict[k] = numpy_to_torch(v) safetensors.torch.save_file(weight_dict, os.path.join(self.model_dir, f'rank{rank}.safetensors')) - model_configs[0].to_json_file(os.path.join(self.model_dir, 'config.json')) tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model") @@ -544,7 +623,7 @@ def forward( ): """ Exports nemo checkpoints to TensorRT-LLM. - +f Args: input_texts (List(str)): list of sentences. max_output_len (int): max generated tokens. diff --git a/nemo/export/trt_llm/converter/model_converter.py b/nemo/export/trt_llm/converter/model_converter.py index 337a0a4e4e77..2012f1b16983 100755 --- a/nemo/export/trt_llm/converter/model_converter.py +++ b/nemo/export/trt_llm/converter/model_converter.py @@ -38,8 +38,6 @@ def get_config(decoder_type, config): "llama": tensorrt_llm.models.llama.config.LLaMAConfig, "gpt": tensorrt_llm.models.gpt.config.GPTConfig, "gptnext": tensorrt_llm.models.gpt.config.GPTConfig, - "falcon": tensorrt_llm.models.falcon.config.FalconConfig, - "gemma": tensorrt_llm.models.GemmaConfig, } config_cls = DECODER_CONFIG[decoder_type] if decoder_type in DECODER_CONFIG else PretrainedConfig @@ -181,6 +179,7 @@ def model_to_trtllm_ckpt( 'tp_size': tensor_parallel_size, 'pp_size': pipeline_parallel_size, } + model_configs = [] weights_dicts = [] num_layers = nemo_model_config.get('num_layers') From d6351bbbf88dc7afc8c466faf26579376eb9d2a5 Mon Sep 17 00:00:00 2001 From: Shanmugam Ramasamy Date: Fri, 23 Aug 2024 00:14:27 -0700 Subject: [PATCH 02/31] Integrating mcore export --- nemo/export/tensorrt_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 648a14f1b7b5..1a9d77e4a581 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -623,7 +623,7 @@ def forward( ): """ Exports nemo checkpoints to TensorRT-LLM. -f + Args: input_texts (List(str)): list of sentences. max_output_len (int): max generated tokens. 
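Patches 01 and 02 above (together with the isort/black reformatting in patch 03 below) swap NeMo's in-tree conversion path (`model_to_trtllm_ckpt` plus `build_and_save_engine`) for the Megatron-Core export helpers whenever the temporary `USE_NEW_CODE` flag is set. The sketch below condenses that new branch into its essential steps. It is only an illustration assembled from the calls visible in the diff: the helper classes and call signatures are the ones the patch imports and invokes, while the wrapper function, its parameter names, and the pass-through `**engine_kwargs` are hypothetical conveniences added here, and the many engine-build options the patch sets explicitly (paged KV cache, LoRA settings, plugins, and so on) are forwarded rather than spelled out.

from megatron.core.export.data_type import DataType
from megatron.core.export.export_config import ExportConfig
from megatron.core.export.model_type import ModelType
from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import DEFAULT_CONVERSION_DICT
from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper


def export_with_mcore(model_state_dict, mcore_model_config, model_type, dtype, engine_dir,
                      tensor_parallelism_size=1, pipeline_parallelism_size=1,
                      use_parallel_embedding=False, use_embedding_sharing=False,
                      gpus_per_node=8, **engine_kwargs):
    """Condensed sketch of the USE_NEW_CODE branch in TensorRTLLM.export()."""
    # Pick the mcore model type and prefix the default conversion-dict keys with
    # 'model.' so they line up with NeMo's state-dict naming.
    input_model_type = getattr(ModelType, model_type)
    conversion_dict = {f'model.{key}': value
                       for key, value in DEFAULT_CONVERSION_DICT[input_model_type].items()}
    helper = TRTLLMHelper(mcore_model_config, input_model_type,
                          trtllm_conversion_dict=conversion_dict)

    # Convert the NeMo checkpoint into TRT-LLM weights/configs, one entry per rank group.
    export_config = ExportConfig(tensor_parallelism_size, pipeline_parallelism_size,
                                 use_parallel_embedding, use_embedding_sharing, gpus_per_node)
    weights_list, config_list = helper.get_trtllm_pretrained_config_and_model_weights(
        model_state_dict=model_state_dict, export_config=export_config,
        dtype=getattr(DataType, dtype))

    # Build and save one TensorRT-LLM engine per converted rank, forwarding the
    # remaining build options (max_input_len, plugins, LoRA settings, ...).
    for weights, config in zip(weights_list, config_list):
        helper.build_and_save_engine(engine_dir=engine_dir,
                                     trtllm_model_weights=weights,
                                     trtllm_model_config=config,
                                     **engine_kwargs)

Here `mcore_model_config` stands for the ModelConfig produced by the patch's get_model_config() translation of the NeMo config, while `model_state_dict`, `model_type`, and `dtype` are the same objects the surrounding export() method already has in scope.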
From 996ea059b3e25f645e57475d34940d87cef0b26f Mon Sep 17 00:00:00 2001 From: shanmugamr1992 Date: Fri, 23 Aug 2024 07:15:20 +0000 Subject: [PATCH 03/31] Apply isort and black reformatting Signed-off-by: shanmugamr1992 --- nemo/export/tensorrt_llm.py | 60 ++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 1a9d77e4a581..f62df8249e92 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -309,28 +309,34 @@ def export( USE_NEW_CODE = True if USE_NEW_CODE: - from megatron.core.export.model_type import ModelType - from megatron.core.export.model_config import ModelConfig from megatron.core.export.data_type import DataType from megatron.core.export.export_config import ExportConfig + from megatron.core.export.model_config import ModelConfig + from megatron.core.export.model_type import ModelType + from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import ( + DEFAULT_CONVERSION_DICT, + ) from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper - from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import DEFAULT_CONVERSION_DICT from tensorrt_llm.layers import MoeConfig def get_model_config(nemo_model_config): conf = ModelConfig() - conf.share_embeddings_and_output_weights = nemo_model_config.get("share_embeddings_and_output_weights", False) + conf.share_embeddings_and_output_weights = nemo_model_config.get( + "share_embeddings_and_output_weights", False + ) conf.activation = nemo_model_config.get('activation') conf.nemo_model_config = nemo_model_config.get('num_moe_experts', 0) conf.num_layers = nemo_model_config.get('num_layers') - conf.moe_router_topk = nemo_model_config.get('moe_router_topk', 0) + conf.moe_router_topk = nemo_model_config.get('moe_router_topk', 0) conf.num_attention_heads = nemo_model_config.get('num_attention_heads') - conf.num_query_groups = nemo_model_config.get('num_query_groups', nemo_model_config['num_attention_heads']) + conf.num_query_groups = nemo_model_config.get( + 'num_query_groups', nemo_model_config['num_attention_heads'] + ) conf.kv_channels = nemo_model_config.get("kv_channels", None) conf.hidden_size = nemo_model_config.get('hidden_size') conf.ffn_hidden_size = nemo_model_config.get('ffn_hidden_size') conf.layernorm_epsilon = nemo_model_config.get('layernorm_epsilon') - conf.position_embedding_type = nemo_model_config.get('position_embedding_type') + conf.position_embedding_type = nemo_model_config.get('position_embedding_type') conf.max_position_embeddings = nemo_model_config.get('max_position_embeddings') conf.bias = nemo_model_config.get('bias') conf.rotary_percentage = nemo_model_config.get('rotary_percentage', 1.0) @@ -342,25 +348,43 @@ def get_model_config(nemo_model_config): conf.moe_tp_mode = nemo_model_config.get('moe_tp_mode', 2) conf.seq_len_interpolation_factor = nemo_model_config.get("seq_len_interpolation_factor") conf.mcore_gpt = nemo_model_config.get("mcore_gpt", False) - conf.share_embeddings_and_output_weights = nemo_model_config.get("share_embeddings_and_output_weights", False) + conf.share_embeddings_and_output_weights = nemo_model_config.get( + "share_embeddings_and_output_weights", False + ) conf.apply_embedding_scaling = nemo_model_config.get("apply_embedding_scaling", False) conf.multi_query_mode = nemo_model_config.get("multi_query_mode", False) conf.normalization = nemo_model_config.get("normalization", "") conf.precision = 
nemo_model_config.get("precision") return conf - - input_model_config = get_model_config(model_configs) + + input_model_config = get_model_config(model_configs) input_model_type = getattr(ModelType, model_type) mcore_model_conversion_dict = DEFAULT_CONVERSION_DICT[input_model_type] - nemo_model_conversion_dict = {f'model.{key}':value for key, value in mcore_model_conversion_dict.items()} - trtllm_helper = TRTLLMHelper(input_model_config, input_model_type, trtllm_conversion_dict = nemo_model_conversion_dict) + nemo_model_conversion_dict = { + f'model.{key}': value for key, value in mcore_model_conversion_dict.items() + } + trtllm_helper = TRTLLMHelper( + input_model_config, input_model_type, trtllm_conversion_dict=nemo_model_conversion_dict + ) input_dtype = getattr(DataType, dtype) - export_config = ExportConfig(tensor_parallelism_size, pipeline_parallelism_size, use_parallel_embedding, use_embedding_sharing, gpus_per_node) - - trtllm_model_weights_list, trtllm_model_config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(model_state_dict = model, export_config = export_config, dtype = input_dtype) + export_config = ExportConfig( + tensor_parallelism_size, + pipeline_parallelism_size, + use_parallel_embedding, + use_embedding_sharing, + gpus_per_node, + ) - for trtllm_model_weights, trtllm_model_config in zip(trtllm_model_weights_list, trtllm_model_config_list): + trtllm_model_weights_list, trtllm_model_config_list = ( + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=model, export_config=export_config, dtype=input_dtype + ) + ) + + for trtllm_model_weights, trtllm_model_config in zip( + trtllm_model_weights_list, trtllm_model_config_list + ): trtllm_helper.build_and_save_engine( max_input_len=max_input_len, max_output_len=max_output_len, @@ -388,12 +412,12 @@ def get_model_config(nemo_model_config): gpt_attention_plugin=gpt_attention_plugin, gemm_plugin=gemm_plugin, ) - else : + else: if model_type == "gpt" or model_type == "starcoder": model_type = "gptnext" if model_type == "mixtral": - model_type = "llama" + model_type = "llama" weights_dicts, model_configs = model_to_trtllm_ckpt( model=model, nemo_model_config=model_configs, From 7c0584a88e21d287bf7ed5d8262515b2cde91894 Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Thu, 22 Aug 2024 20:11:30 -0700 Subject: [PATCH 04/31] Move trt imports in nemo.collections.llm inside respective functions (#10234) Signed-off-by: Hemil Desai --- nemo/collections/llm/api.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 46d94d26b03b..8bead26e653e 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -8,25 +8,10 @@ from typing_extensions import Annotated from nemo.collections.llm.utils import Config, task -from nemo.deploy import DeployPyTriton from nemo.lightning import AutoResume, NeMoLogger, OptimizerModule, Trainer, io from nemo.lightning.pytorch.callbacks import PEFT, ModelTransform from nemo.utils import logging -trt_llm_supported = True -try: - from nemo.export.tensorrt_llm import TensorRTLLM -except ImportError as error: - logging.warning(f"TensorRTLLM could not be imported from nemo.export: {error}") - trt_llm_supported = False - -uvicorn_supported = True -try: - import uvicorn -except ImportError as error: - logging.warning(f"uvicorn could not be imported: {error}") - uvicorn_supported = False - TokenizerType = Any @@ -253,6 +238,8 @@ def 
get_trtllm_deployable( max_batch_size, dtype, ): + from nemo.export.tensorrt_llm import TensorRTLLM + if triton_model_repository is None: trt_llm_path = "/tmp/trt_llm_model_dir/" Path(trt_llm_path).mkdir(parents=True, exist_ok=True) @@ -274,8 +261,6 @@ def get_trtllm_deployable( if nemo_checkpoint is not None and model_type is None: raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.") - if not trt_llm_supported: - raise ValueError("TensorRT-LLM engine is not supported in this environment.") trt_llm_exporter = TensorRTLLM( model_dir=trt_llm_path, load_model=(nemo_checkpoint is None), @@ -334,6 +319,8 @@ def deploy( rest_service_port: int = 8000, openai_format_response: bool = False, ): + from nemo.deploy import DeployPyTriton + if start_rest_service: if triton_port == rest_service_port: logging.error("REST service port and Triton server port cannot use the same port.") @@ -370,6 +357,13 @@ def deploy( logging.error("Error message has occurred during deploy function. Error message: " + str(error)) return + uvicorn_supported = True + try: + import uvicorn + except ImportError as error: + logging.warning(f"uvicorn could not be imported: {error}") + uvicorn_supported = False + try: logging.info("Model serving on Triton is will be started.") if start_rest_service and uvicorn_supported: From c34d29a8add2fba5f16159f5429c8e9990374d99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Fri, 23 Aug 2024 09:56:30 -0400 Subject: [PATCH 05/31] Add tests for LazyNeMoIterator and fix case with metadata_only=True and offsets in manifest (#10198) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add tests for LazyNeMoIterator and fix case with manifest_only=True and offsets in manifest Signed-off-by: Piotr Żelasko * Address code review Signed-off-by: Piotr Żelasko * fix tests Signed-off-by: Piotr Żelasko * fix tests Signed-off-by: Piotr Żelasko --------- Signed-off-by: Piotr Żelasko --- .../common/data/lhotse/nemo_adapters.py | 55 +++-- .../common/test_lhotse_nemo_adapters.py | 188 ++++++++++++++++++ 2 files changed, 228 insertions(+), 15 deletions(-) create mode 100644 tests/collections/common/test_lhotse_nemo_adapters.py diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py index 2a4b71a18880..3c5ced5d4018 100644 --- a/nemo/collections/common/data/lhotse/nemo_adapters.py +++ b/nemo/collections/common/data/lhotse/nemo_adapters.py @@ -24,7 +24,7 @@ import lhotse.serialization import soundfile from cytoolz import groupby -from lhotse import AudioSource, Recording, SupervisionSegment +from lhotse import AudioSource, MonoCut, Recording, SupervisionSegment from lhotse.audio.backend import LibsndfileBackend from lhotse.cut import Cut from lhotse.dataset.dataloading import resolve_seed @@ -112,11 +112,9 @@ def __iter__(self) -> Generator[Cut, None, None]: audio_path = get_full_path(str(data.pop("audio_filepath")), str(self.path)) duration = data.pop("duration") offset = data.pop("offset", None) - recording = self._create_recording(audio_path, duration, data.pop("sampling_rate", None)) - cut = recording.to_cut() - if offset is not None: - cut = cut.truncate(offset=offset, duration=duration, preserve_id=True) - cut.id = f"{cut.id}-{round(offset * 1e2):06d}-{round(duration * 1e2):06d}" + cut = self._create_cut( + audio_path=audio_path, offset=offset, duration=duration, sampling_rate=data.pop("sampling_rate", None) + ) # Note that start=0 and not 
start=offset because supervision's start if relative to the # start of the cut; and cut.start is already set to offset cut.supervisions.append( @@ -140,6 +138,42 @@ def __len__(self) -> int: def __add__(self, other): return LazyIteratorChain(self, other) + def _create_cut( + self, + audio_path: str, + offset: float, + duration: float, + sampling_rate: int | None = None, + ) -> Cut: + if not self.metadata_only: + recording = self._create_recording(audio_path, duration, sampling_rate) + cut = recording.to_cut() + if offset is not None: + cut = cut.truncate(offset=offset, duration=duration, preserve_id=True) + cut.id = f"{cut.id}-{round(offset * 1e2):06d}-{round(duration * 1e2):06d}" + else: + # Only metadata requested. + # We'll provide accurate metadata for Cut but inaccurate metadata for Recording to avoid + # incurring IO penalty (note that Lhotse manifests contain more information than + # NeMo manifests, so for actual dataloading we have to fill it using the audio file). + sr = ifnone(sampling_rate, 16000) # fake sampling rate + offset = ifnone(offset, 0.0) + cut = MonoCut( + id=audio_path, + start=offset, + duration=duration, + channel=0, + supervisions=[], + recording=Recording( + id=audio_path, + sources=[AudioSource(type="dummy", channels=[0], source="")], + sampling_rate=sr, + duration=offset + duration, + num_samples=compute_num_samples(offset + duration, sr), + ), + ) + return cut + def _create_recording( self, audio_path: str, @@ -156,15 +190,6 @@ def _create_recording( duration=duration, channel_ids=[0], ) - elif self.metadata_only: - return Recording( - id=audio_path, - sources=[AudioSource(type="file", channels=[0], source=audio_path)], - sampling_rate=-1, - num_samples=-1, - duration=duration, - channel_ids=[0], - ) else: return Recording.from_file(audio_path) diff --git a/tests/collections/common/test_lhotse_nemo_adapters.py b/tests/collections/common/test_lhotse_nemo_adapters.py new file mode 100644 index 000000000000..a76116b10dd7 --- /dev/null +++ b/tests/collections/common/test_lhotse_nemo_adapters.py @@ -0,0 +1,188 @@ +import numpy as np +import pytest +from lhotse import AudioSource, CutSet, MonoCut, Recording, SupervisionSegment +from lhotse.serialization import save_to_jsonl +from lhotse.testing.dummies import DummyManifest + +from nemo.collections.common.data.lhotse.nemo_adapters import LazyNeMoIterator + + +@pytest.fixture +def nemo_manifest_path(tmp_path_factory): + """2 utterances of length 1s as a NeMo manifest.""" + tmpdir = tmp_path_factory.mktemp("nemo_data") + cuts = DummyManifest(CutSet, begin_id=0, end_id=2, with_data=True).save_audios(tmpdir, progress_bar=False) + nemo = [] + for c in cuts: + nemo.append( + { + "audio_filepath": c.recording.sources[0].source, + "text": "irrelevant", + "duration": c.duration, + "lang": "en", + } + ) + p = tmpdir / "nemo_manifest.json" + save_to_jsonl(nemo, p) + return p + + +def test_lazy_nemo_iterator(nemo_manifest_path): + cuts = CutSet(LazyNeMoIterator(nemo_manifest_path)) + + assert len(cuts) == 2 + + for c in cuts: + assert isinstance(c, MonoCut) + assert c.start == 0.0 + assert c.duration == 1.0 + assert c.num_channels == 1 + assert c.sampling_rate == 16000 + assert c.num_samples == 16000 + + assert c.has_recording + assert isinstance(c.recording, Recording) + assert c.recording.duration == 1.0 + assert c.recording.num_channels == 1 + assert c.recording.num_samples == 16000 + assert len(c.recording.sources) == 1 + assert isinstance(c.recording.sources[0], AudioSource) + assert c.recording.sources[0].type == "file" + 
+ audio = c.load_audio() + assert isinstance(audio, np.ndarray) + assert audio.shape == (1, 16000) + assert audio.dtype == np.float32 + + assert len(c.supervisions) == 1 + s = c.supervisions[0] + assert isinstance(s, SupervisionSegment) + assert s.start == 0 + assert s.duration == 1 + assert s.channel == 0 + assert s.text == "irrelevant" + assert s.language == "en" + + +@pytest.fixture +def nemo_offset_manifest_path(tmp_path_factory): + """ + 4 utterances of length 0.5s as a NeMo manifest. + They are dervied from two audio files of 1s duration, so + two of them have offset 0 and the other two have offset 0.5. + """ + tmpdir = tmp_path_factory.mktemp("nemo_data_offset") + cuts = ( + DummyManifest(CutSet, begin_id=0, end_id=2, with_data=True) + .save_audios(tmpdir, progress_bar=False) + .cut_into_windows(duration=0.5, hop=0.5) + ) + nemo = [] + for c in cuts: + nemo.append( + { + "audio_filepath": c.recording.sources[0].source, + "text": "irrelevant", + "offset": c.start, + "duration": c.duration, + "lang": "en", + } + ) + p = tmpdir / "nemo_manifest.json" + save_to_jsonl(nemo, p) + return p + + +def test_lazy_nemo_iterator_with_offset(nemo_offset_manifest_path): + cuts = CutSet(LazyNeMoIterator(nemo_offset_manifest_path)) + + assert len(cuts) == 4 + + for idx, c in enumerate(cuts): + # Note we originally had 1 cut per 1s audio file. + # Then we cut them into 0.5s cuts, so we have 4 cuts in total, + # 2 of them start at 0s and the other 2 start at 0.5s. + is_even = idx % 2 == 0 + + assert isinstance(c, MonoCut) + if is_even: + assert c.start == 0.0 + else: + assert c.start == 0.5 + assert c.duration == 0.5 + assert c.num_channels == 1 + assert c.sampling_rate == 16000 + assert c.num_samples == 8000 + + assert c.has_recording + assert isinstance(c.recording, Recording) + assert c.recording.duration == 1.0 + assert c.recording.num_channels == 1 + assert c.recording.num_samples == 16000 + assert len(c.recording.sources) == 1 + assert isinstance(c.recording.sources[0], AudioSource) + assert c.recording.sources[0].type == "file" + + audio = c.load_audio() + assert isinstance(audio, np.ndarray) + assert audio.shape == (1, 8000) + assert audio.dtype == np.float32 + + assert len(c.supervisions) == 1 + s = c.supervisions[0] + assert isinstance(s, SupervisionSegment) + assert s.start == 0 + assert s.duration == 0.5 + assert s.channel == 0 + assert s.text == "irrelevant" + assert s.language == "en" + + +def test_lazy_nemo_iterator_with_offset_metadata_only(nemo_offset_manifest_path): + cuts = CutSet(LazyNeMoIterator(nemo_offset_manifest_path, metadata_only=True)) + + assert len(cuts) == 4 + + for idx, c in enumerate(cuts): + # Note we originally had 1 cut per 1s audio file. + # Then we cut them into 0.5s cuts, so we have 4 cuts in total, + # 2 of them start at 0s and the other 2 start at 0.5s. + is_even = idx % 2 == 0 + + assert isinstance(c, MonoCut) + if is_even: + assert c.start == 0.0 + else: + assert c.start == 0.5 + assert c.duration == 0.5 + assert c.num_channels == 1 + assert c.sampling_rate == 16000 + assert c.num_samples == 8000 + + # With metadata_only=True we can't actually check what's in the Recording. + # The metadata for it may be incorrect (but is correct for the actual Cut), + # but we don't have to perform any I/O to read the file for info. 
+ assert c.has_recording + assert isinstance(c.recording, Recording) + if is_even: + assert c.recording.duration == 0.5 + assert c.recording.num_samples == 8000 + else: + assert c.recording.duration == 1.0 + assert c.recording.num_samples == 16000 + assert c.recording.num_channels == 1 + assert len(c.recording.sources) == 1 + assert isinstance(c.recording.sources[0], AudioSource) + assert c.recording.sources[0].type == "dummy" + + with pytest.raises(AssertionError): + c.load_audio() + + assert len(c.supervisions) == 1 + s = c.supervisions[0] + assert isinstance(s, SupervisionSegment) + assert s.start == 0 + assert s.duration == 0.5 + assert s.channel == 0 + assert s.text == "irrelevant" + assert s.language == "en" From 3aa1e5c85e1313db4fa54ddd8870993267def303 Mon Sep 17 00:00:00 2001 From: Anna Shors <71393111+ashors1@users.noreply.github.com> Date: Fri, 23 Aug 2024 07:14:01 -0700 Subject: [PATCH 06/31] [NeMo-UX] Fix a serialization bug that prevents users from moving checkpoints (#9939) * perfor serialization using relative paths to allow users to move checkpoints after they're saved Signed-off-by: ashors1 * Apply isort and black reformatting Signed-off-by: ashors1 * remove unused import Signed-off-by: ashors1 * fix artifact load Signed-off-by: ashors1 * fix path artifact Signed-off-by: ashors1 * remove unused import Signed-off-by: ashors1 --------- Signed-off-by: ashors1 Signed-off-by: ashors1 Co-authored-by: ashors1 --- nemo/lightning/io/api.py | 50 +------------ nemo/lightning/io/artifact/base.py | 2 +- nemo/lightning/io/artifact/file.py | 15 ++-- nemo/lightning/io/artifact/pickle.py | 8 +-- nemo/lightning/io/mixin.py | 103 +++++++++++++++++++++++---- 5 files changed, 103 insertions(+), 75 deletions(-) diff --git a/nemo/lightning/io/api.py b/nemo/lightning/io/api.py index 4d31f020c44a..4315b3211bf7 100644 --- a/nemo/lightning/io/api.py +++ b/nemo/lightning/io/api.py @@ -1,61 +1,13 @@ -import json from pathlib import Path -from pydoc import locate from typing import Any, Callable, Optional, Type, TypeVar import fiddle as fdl import pytorch_lightning as pl from fiddle._src.experimental import serialization -from nemo.lightning.io.mixin import ConnectorMixin, ConnT, ModelConnector, track_io +from nemo.lightning.io.mixin import ConnectorMixin, ConnT, ModelConnector, load from nemo.lightning.io.pl import TrainerContext -CkptType = TypeVar("CkptType") - - -def load(path: Path, output_type: Type[CkptType] = Any) -> CkptType: - """ - Loads a configuration from a pickle file and constructs an object of the specified type. - - Args: - path (Path): The path to the pickle file or directory containing 'io.pkl'. - output_type (Type[CkptType]): The type of the object to be constructed from the loaded data. - - Returns - ------- - CkptType: An instance of the specified type constructed from the loaded configuration. - - Raises - ------ - FileNotFoundError: If the specified file does not exist. 
- - Example: - loaded_model = load("/path/to/model", output_type=MyModel) - """ - del output_type # Just for type-hint - - _path = Path(path) - if hasattr(_path, 'is_dir') and _path.is_dir(): - _path = Path(_path) / "io.json" - elif hasattr(_path, 'isdir') and _path.isdir: - _path = Path(_path) / "io.json" - - if not _path.is_file(): - raise FileNotFoundError(f"No such file: '{_path}'") - - ## add IO functionality to custom objects present in the json file - with open(_path) as f: - j = json.load(f) - for obj, val in j["objects"].items(): - clss = ".".join([val["type"]["module"], val["type"]["name"]]) - if not serialization.find_node_traverser(locate(clss)): - track_io(locate(clss)) - - with open(_path, "rb") as f: - config = serialization.load_json(f.read()) - - return fdl.build(config) - def load_context(path: Path) -> TrainerContext: """ diff --git a/nemo/lightning/io/artifact/base.py b/nemo/lightning/io/artifact/base.py index 9119b2474b17..a997df42f843 100644 --- a/nemo/lightning/io/artifact/base.py +++ b/nemo/lightning/io/artifact/base.py @@ -11,7 +11,7 @@ def __init__(self, attr: str, required: bool = True): self.required = required @abstractmethod - def dump(self, value: ValueT, path: Path) -> ValueT: + def dump(self, value: ValueT, absolute_dir: Path, relative_dir: Path) -> ValueT: pass @abstractmethod diff --git a/nemo/lightning/io/artifact/file.py b/nemo/lightning/io/artifact/file.py index 0bd4f48dc17f..76bd0c6003a6 100644 --- a/nemo/lightning/io/artifact/file.py +++ b/nemo/lightning/io/artifact/file.py @@ -6,8 +6,8 @@ class PathArtifact(Artifact[Path]): - def dump(self, value: Path, path: Path) -> Path: - new_value = copy_file(value, path) + def dump(self, value: Path, absolute_dir: Path, relative_dir: Path) -> Path: + new_value = copy_file(value, absolute_dir, relative_dir) return new_value def load(self, path: Path) -> Path: @@ -15,15 +15,16 @@ def load(self, path: Path) -> Path: class FileArtifact(Artifact[str]): - def dump(self, value: str, path: Path) -> str: - new_value = copy_file(value, path) + def dump(self, value: str, absolute_dir: Path, relative_dir: Path) -> str: + new_value = copy_file(value, absolute_dir, relative_dir) return str(new_value) def load(self, path: str) -> str: return path -def copy_file(src: Union[Path, str], dst: Union[Path, str]): - output = Path(dst) / Path(src).name +def copy_file(src: Union[Path, str], path: Union[Path, str], relative_dst: Union[Path, str]): + relative_path = Path(relative_dst) / Path(src).name + output = Path(path) / relative_path shutil.copy2(src, output) - return output + return relative_path diff --git a/nemo/lightning/io/artifact/pickle.py b/nemo/lightning/io/artifact/pickle.py index 31ed7e36ac93..61a9c82237fc 100644 --- a/nemo/lightning/io/artifact/pickle.py +++ b/nemo/lightning/io/artifact/pickle.py @@ -7,12 +7,12 @@ class PickleArtifact(Artifact[Any]): - def dump(self, value: Any, path: Path) -> Path: - file = self.file_path(path) - with open(file, "wb") as f: + def dump(self, absolute_dir: Path, relative_dir: Path) -> Path: + relative_file = self.file_path(relative_dir) + with open(Path(absolute_dir) / relative_file, "wb") as f: dump(value, f) - return file + return relative_file def load(self, path: Path) -> Any: with open(self.file_path(path), "rb") as f: diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index d0d4d0243ff7..eff4cd9434ce 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -1,5 +1,6 @@ import functools import inspect +import json import shutil import threading 
import types @@ -7,11 +8,13 @@ from copy import deepcopy from dataclasses import is_dataclass from pathlib import Path +from pydoc import locate from typing import Any, Callable, Dict, List, Optional, Type, TypeVar, Union import fiddle as fdl import fiddle._src.experimental.dataclasses as fdl_dc -from cloudpickle import dump, load +from cloudpickle import dump +from cloudpickle import load as pickle_load from fiddle._src.experimental import serialization from typing_extensions import Self @@ -21,6 +24,7 @@ from nemo.lightning.io.fdl_torch import enable as _enable_ext ConnT = TypeVar('ConnT', bound=ModelConnector) +CkptType = TypeVar("CkptType") _enable_ext() @@ -136,21 +140,24 @@ def io_dump(self, output: Path): will be stored. """ output_path = Path(output) - artifacts_dir = output_path / "artifacts" + local_artifacts_dir = "artifacts" + artifacts_dir = output_path / local_artifacts_dir artifacts_dir.mkdir(parents=True, exist_ok=True) # Store artifacts directory in thread-local storage - _thread_local.artifacts_dir = artifacts_dir + _thread_local.local_artifacts_dir = local_artifacts_dir + _thread_local.output_path = output_path config_path = output_path / "io.json" with open(config_path, "w") as f: io = deepcopy(self.__io__) - _artifact_transform(io, artifacts_dir) + _artifact_transform_save(io, output_path, local_artifacts_dir) json = serialization.dump_json(io) f.write(json) # Clear thread-local storage after io_dump is complete - del _thread_local.artifacts_dir + del _thread_local.local_artifacts_dir + del _thread_local.output_path # Check if artifacts directory is empty and delete if so if not any(artifacts_dir.iterdir()): @@ -481,23 +488,28 @@ def _io_flatten_object(instance): try: serialization.dump_json(instance.__io__) except (serialization.UnserializableValueError, AttributeError) as e: - if not hasattr(_thread_local, "artifacts_dir"): + if not hasattr(_thread_local, "local_artifacts_dir") or not hasattr(_thread_local, "output_path"): raise e - artifact_dir = _thread_local.artifacts_dir - artifact_path = artifact_dir / f"{uuid.uuid4()}" + local_artifact_path = Path(_thread_local.local_artifacts_dir) / f"{uuid.uuid4()}" + output_path = _thread_local.output_path + artifact_path = output_path / local_artifact_path with open(artifact_path, "wb") as f: dump(getattr(instance, "__io__", instance), f) - return (str(artifact_path),), None + return (str(local_artifact_path),), None return instance.__io__.__flatten__() def _io_unflatten_object(values, metadata): + + assert hasattr(_thread_local, "output_dir") + output_dir = _thread_local.output_dir + if len(values) == 1: pickle_path = values[0] - with open(pickle_path, "rb") as f: - return load(f) + with open(Path(output_dir) / pickle_path, "rb") as f: + return pickle_load(f) return fdl.Config.__unflatten__(values, metadata) @@ -511,19 +523,82 @@ def _io_path_elements_fn(x): return x.__io__.__path_elements__() -def _artifact_transform(cfg: fdl.Config, output_path: Path): +def _artifact_transform_save(cfg: fdl.Config, output_path: Path, relative_dir: Path = "artifacts"): for artifact in getattr(cfg.__fn_or_cls__, "__io_artifacts__", []): current_val = getattr(cfg, artifact.attr) if current_val is None: if artifact.required: raise ValueError(f"Artifact '{artifact.attr}' is required but not provided") continue - new_val = artifact.dump(current_val, output_path) + ## dump artifact and return the relative path + new_val = artifact.dump(current_val, output_path, relative_dir) setattr(cfg, artifact.attr, new_val) for attr in dir(cfg): try: if 
isinstance(getattr(cfg, attr), fdl.Config): - _artifact_transform(getattr(cfg, attr), output_path=output_path) + _artifact_transform_save(getattr(cfg, attr), output_path=output_path, relative_dir=relative_dir) except ValueError: pass + + +def _artifact_transform_load(cfg: fdl.Config, path: Path): + for artifact in getattr(cfg.__fn_or_cls__, "__io_artifacts__", []): + current_val = getattr(cfg, artifact.attr) + ## replace local path with absolute one + new_val = str(Path(path) / current_val) + setattr(cfg, artifact.attr, new_val) + + for attr in dir(cfg): + try: + if isinstance(getattr(cfg, attr), fdl.Config): + _artifact_transform_load(getattr(cfg, attr), path=path) + except ValueError: + pass + + +def load(path: Path, output_type: Type[CkptType] = Any) -> CkptType: + """ + Loads a configuration from a pickle file and constructs an object of the specified type. + + Args: + path (Path): The path to the pickle file or directory containing 'io.pkl'. + output_type (Type[CkptType]): The type of the object to be constructed from the loaded data. + + Returns + ------- + CkptType: An instance of the specified type constructed from the loaded configuration. + + Raises + ------ + FileNotFoundError: If the specified file does not exist. + + Example: + loaded_model = load("/path/to/model", output_type=MyModel) + """ + del output_type # Just for type-hint + + _path = Path(path) + _thread_local.output_dir = _path + + if hasattr(_path, 'is_dir') and _path.is_dir(): + _path = Path(_path) / "io.json" + elif hasattr(_path, 'isdir') and _path.isdir: + _path = Path(_path) / "io.json" + + if not _path.is_file(): + raise FileNotFoundError(f"No such file: '{_path}'") + + ## add IO functionality to custom objects present in the json file + with open(_path) as f: + j = json.load(f) + for obj, val in j["objects"].items(): + clss = ".".join([val["type"]["module"], val["type"]["name"]]) + if not serialization.find_node_traverser(locate(clss)): + track_io(locate(clss)) + + with open(_path, "rb") as f: + config = serialization.load_json(f.read()) + _artifact_transform_load(config, path) + + return fdl.build(config) From 57de2882900ad8d585eb9091caf3ed8b9380b6c4 Mon Sep 17 00:00:00 2001 From: Shriya Rishab <69161273+ShriyaPalsamudram@users.noreply.github.com> Date: Fri, 23 Aug 2024 11:20:34 -0400 Subject: [PATCH 07/31] Add MemoryProfileCallback (#10166) * Add MemoryProfileCallback Signed-off-by: Shriya Palsamudram * Apply isort and black reformatting Signed-off-by: ShriyaPalsamudram * Remove reference cycles, save snapshot on specific ranks Signed-off-by: Shriya Palsamudram * Remove unnecessary imports Signed-off-by: Shriya Palsamudram * Apply isort and black reformatting Signed-off-by: ShriyaPalsamudram * Update docstring Signed-off-by: Shriya Palsamudram --------- Signed-off-by: Shriya Palsamudram Signed-off-by: ShriyaPalsamudram Signed-off-by: Shriya Rishab <69161273+ShriyaPalsamudram@users.noreply.github.com> Co-authored-by: ShriyaPalsamudram --- nemo/lightning/pytorch/callbacks/__init__.py | 2 + .../pytorch/callbacks/memory_profiler.py | 78 +++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 nemo/lightning/pytorch/callbacks/memory_profiler.py diff --git a/nemo/lightning/pytorch/callbacks/__init__.py b/nemo/lightning/pytorch/callbacks/__init__.py index 5b3113dea885..ef31e1078298 100644 --- a/nemo/lightning/pytorch/callbacks/__init__.py +++ b/nemo/lightning/pytorch/callbacks/__init__.py @@ -1,4 +1,5 @@ from nemo.lightning.pytorch.callbacks.ddp_parity_checker import DdpParityChecker +from 
nemo.lightning.pytorch.callbacks.memory_profiler import MemoryProfileCallback from nemo.lightning.pytorch.callbacks.model_checkpoint import ModelCheckpoint from nemo.lightning.pytorch.callbacks.model_transform import ModelTransform from nemo.lightning.pytorch.callbacks.nsys import NsysCallback @@ -8,6 +9,7 @@ from nemo.lightning.pytorch.callbacks.progress_printer import ProgressPrinter __all__ = [ + "MemoryProfileCallback", "ModelCheckpoint", "ModelTransform", "PEFT", diff --git a/nemo/lightning/pytorch/callbacks/memory_profiler.py b/nemo/lightning/pytorch/callbacks/memory_profiler.py new file mode 100644 index 000000000000..089479637f61 --- /dev/null +++ b/nemo/lightning/pytorch/callbacks/memory_profiler.py @@ -0,0 +1,78 @@ +import os + +import torch +from pytorch_lightning.callbacks.callback import Callback +from torch.utils.viz._cycles import warn_tensor_cycles + +from nemo.lightning import io +from nemo.utils import logging +from nemo.utils.get_rank import get_rank + + +class MemoryProfileCallback(Callback, io.IOMixin): + """ + This callback enables recording a timeline of memory allocations during training. + The generated .pickle profiles can be analyzed at https://pytorch.org/memory_viz + + More info about the profiles can be found [here](https://pytorch.org/blog/understanding-gpu-memory-1/). + + Args: + dir (Optional[str]): Directory to store the memory profile dump + warn_cycles (Optional[bool]): Whether to enable [reference cycle detection](https://pytorch.org/blog/understanding-gpu-memory-2/) + rank (Optional[list[int]]): List of ranks to collect snapshot on, defaults to all if list is empty + + Example: + >>> callback = MemoryProfileCallback(dir="/mem_profile", ranks=[0]) + >>> trainer = Trainer(callbacks=[callback]) + """ + + def __init__(self, dir: str = "/mem_profile", warn_cycles=True, ranks=[]): + + self.dir = dir + self.ranks = ranks + + os.makedirs(self.dir, exist_ok=True) + logging.info(f"Torch memory profiles will be written to: {self.dir}") + + if warn_cycles: + logging.info("Enabling reference cycle detector") + warn_tensor_cycles() + + def enable_on_rank(self) -> bool: + if not self.ranks: + return True + return get_rank() in self.ranks + + def setup(self, trainer, pl_module, stage) -> None: + """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-end + We use it here to start recording the memory profiler. + """ + + if trainer.max_steps > 1000: + logging.warning( + f"Memory profiling creates snapshots during the entire training process, \ + where every iteration increases the size of the snapshot. \ + Try reducing trainer.max_steps to avoid running into issues" + ) + + if torch.distributed.is_initialized() and self.enable_on_rank(): + torch.cuda.memory._record_memory_history(max_entries=100000) + + def on_train_end(self, trainer, pl_module) -> None: + """PyTorch Lightning hook: + https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#on-train-end + We use it here to finish memory profiling and write the snapshot. 
+ """ + + logging.info( + f"on_train_batch_end rank: {get_rank()} mem: {torch.cuda.memory_allocated()/1024/1024/1024} / {torch.cuda.max_memory_reserved()/1024/1024/1024}" + ) + + if torch.distributed.is_initialized() and self.enable_on_rank(): + rank = get_rank() + _snapshot_path = f"{self.dir}/memory_snapshot-rank{rank}.pickle" + logging.info(f"Writing memory profile snapshot to {_snapshot_path}") + torch.cuda.memory._dump_snapshot(f"{_snapshot_path}") + torch.cuda.memory._record_memory_history(enabled=None) + logging.info(f"Finished writing memory profile snapshot: {_snapshot_path}") From 9214a4e13684477a7094fcebb775733fefe09bed Mon Sep 17 00:00:00 2001 From: Dong Hyuk Chang Date: Fri, 23 Aug 2024 14:47:00 -0400 Subject: [PATCH 08/31] Lower bound transformers to support nemotron (#10240) Signed-off-by: Dong Hyuk Chang Co-authored-by: Dong Hyuk Chang --- requirements/requirements_lightning.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements_lightning.txt b/requirements/requirements_lightning.txt index 1b3397f69033..171abce41f37 100644 --- a/requirements/requirements_lightning.txt +++ b/requirements/requirements_lightning.txt @@ -4,6 +4,6 @@ hydra-core>1.3,<=1.3.2 omegaconf<=2.3 pytorch-lightning>2.2.1 torchmetrics>=0.11.0 -transformers +transformers>=4.44.0 wandb webdataset>=0.2.86 From c690c4f7fcd71b6d0646374b7826bd2e17c41f25 Mon Sep 17 00:00:00 2001 From: Kuray107 Date: Sat, 24 Aug 2024 00:38:56 -0400 Subject: [PATCH 09/31] [Audio] SSL Pretraining framework for flow-matching model for audio processing (#10052) Flow matching generative model with SSL pretraining framework Signed-off-by: Pin-Jui Ku Co-authored-by: Kuray107 --- examples/audio/audio_to_audio_train.py | 4 + .../audio/conf/flow_matching_generative.yaml | 164 ++++++ .../flow_matching_generative_finetuning.yaml | 167 ++++++ ...w_matching_generative_ssl_pretraining.yaml | 171 ++++++ .../audio/data/audio_to_audio_lhotse.py | 22 +- .../audio/models/audio_to_audio.py | 33 +- nemo/collections/audio/models/enhancement.py | 269 ++++++++++ .../audio/modules/ssl_pretrain_masking.py | 106 ++++ .../audio/parts/submodules/flow.py | 252 +++++++++ .../audio/parts/submodules/transformerunet.py | 507 ++++++++++++++++++ .../audio/parts/utils/callbacks.py | 177 ++++++ 11 files changed, 1865 insertions(+), 7 deletions(-) create mode 100644 examples/audio/conf/flow_matching_generative.yaml create mode 100644 examples/audio/conf/flow_matching_generative_finetuning.yaml create mode 100644 examples/audio/conf/flow_matching_generative_ssl_pretraining.yaml create mode 100644 nemo/collections/audio/modules/ssl_pretrain_masking.py create mode 100644 nemo/collections/audio/parts/submodules/flow.py create mode 100644 nemo/collections/audio/parts/submodules/transformerunet.py create mode 100644 nemo/collections/audio/parts/utils/callbacks.py diff --git a/examples/audio/audio_to_audio_train.py b/examples/audio/audio_to_audio_train.py index b197d2084144..cef46dcf20b6 100644 --- a/examples/audio/audio_to_audio_train.py +++ b/examples/audio/audio_to_audio_train.py @@ -34,6 +34,7 @@ from nemo.collections.audio.models.enhancement import ( EncMaskDecAudioToAudioModel, + FlowMatchingAudioToAudioModel, PredictiveAudioToAudioModel, SchroedingerBridgeAudioToAudioModel, ScoreBasedGenerativeAudioToAudioModel, @@ -50,6 +51,7 @@ class ModelType(str, Enum): Predictive = 'predictive' ScoreBased = 'score_based' SchroedingerBridge = 'schroedinger_bridge' + FlowMatching = 'flow_matching' def get_model_class(model_type: ModelType): @@ 
-62,6 +64,8 @@ def get_model_class(model_type: ModelType): return ScoreBasedGenerativeAudioToAudioModel elif model_type == ModelType.SchroedingerBridge: return SchroedingerBridgeAudioToAudioModel + elif model_type == ModelType.FlowMatching: + return FlowMatchingAudioToAudioModel else: raise ValueError(f'Unknown model type: {model_type}') diff --git a/examples/audio/conf/flow_matching_generative.yaml b/examples/audio/conf/flow_matching_generative.yaml new file mode 100644 index 000000000000..5f644f328e6d --- /dev/null +++ b/examples/audio/conf/flow_matching_generative.yaml @@ -0,0 +1,164 @@ +name: flow_matching_generative + +model: + type: flow_matching + sample_rate: 16000 + skip_nan_grad: false + num_outputs: 1 + p_cond: 0.9 # Proability of feeding the conditional input into the model. + normalize_input: true # normalize the input signal to 0dBFS + max_utts_evaluation_metrics: 500 + + train_ds: + manifest_filepath: ??? + input_key: noisy_filepath + target_key: clean_filepath + audio_duration: 6.14 # Number of STFT time frames = 1 + audio_duration // encoder.hop_length = 768 + random_offset: true + batch_size: 8 # batch size may be increased based on the available memory + shuffle: true + num_workers: 8 + pin_memory: true + + validation_ds: + manifest_filepath: ??? + input_key: noisy_filepath + target_key: clean_filepath + batch_size: 8 + shuffle: false + num_workers: 4 + pin_memory: true + + log_config: + log_tensorboard: true + log_wandb: false + max_utts: 8 + + encoder: + _target_: nemo.collections.audio.modules.transforms.AudioToSpectrogram + fft_length: 510 # Number of subbands in the STFT = fft_length // 2 + 1 = 256 + hop_length: 128 + magnitude_power: 0.5 + scale: 0.33 + + decoder: + _target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio + fft_length: ${model.encoder.fft_length} + hop_length: ${model.encoder.hop_length} + magnitude_power: ${model.encoder.magnitude_power} + scale: ${model.encoder.scale} + + estimator: + _target_: nemo.collections.audio.parts.submodules.transformerunet.SpectrogramTransformerUNet + in_channels: 2 # concatenation of single-channel perturbed and noisy + out_channels: 1 # single-channel score estimate + depth: 24 + ff_dropout: 0.1 + time_hidden_dim: 1024 + + flow: + _target_: nemo.collections.audio.parts.submodules.flow.OptimalTransportFlow + sigma_start: 1.0 + sigma_end: 1e-4 + + sampler: + _target_: nemo.collections.audio.parts.submodules.flow.ConditionalFlowMatchingEulerSampler + num_steps: 20 + time_min: 1e-8 + time_max: 1.0 + + loss: + _target_: nemo.collections.audio.losses.MSELoss + ndim: 4 # loss is calculated on the score in the encoded domain (batch, channel, dimension, time) + + metrics: + val: + sisdr: # output SI-SDR + _target_: torchmetrics.audio.ScaleInvariantSignalDistortionRatio + estoi: # output ESTOI + _target_: torchmetrics.audio.ShortTimeObjectiveIntelligibility + fs: ${model.sample_rate} + extended: true + pesq: # output PESQ + _target_: torchmetrics.audio.PerceptualEvaluationSpeechQuality + fs: ${model.sample_rate} + mode: wb + + optim: + name: adam + lr: 1e-4 + # optimizer arguments + betas: [0.9, 0.999] + weight_decay: 0.0 + + # scheduler setup + sched: + name: CosineAnnealing + # scheduler config override + warmup_steps: 5000 + warmup_ratio: null + min_lr: 0 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: -1 + max_steps: -1 # computed at runtime if not set + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations 
+ accelerator: auto + strategy: ddp + accumulate_grad_batches: 1 + gradient_clip_val: 0.2 + precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + log_every_n_steps: 25 # Interval of logging. + enable_progress_bar: true + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: false # Provided by exp_manager + logger: false # Provided by exp_manager + +exp_manager: + exp_dir: null + name: ${name} + + # use exponential moving average for model parameters + ema: + enable: true + decay: 0.999 # decay rate + cpu_offload: false # offload EMA parameters to CPU to save GPU memory + every_n_steps: 1 # how often to update EMA weights + validate_original_weights: false # use original weights for validation calculation? + + # logging + create_tensorboard_logger: true + + # checkpointing + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: val_pesq + mode: max + save_top_k: 3 + always_save_nemo: true # saves the checkpoints as nemo files instead of PTL checkpoints + + # early stopping + create_early_stopping_callback: true + early_stopping_callback_params: + monitor: val_sisdr + mode: max + min_delta: 0.0 + patience: 20 # patience in terms of check_val_every_n_epoch + verbose: true + strict: false # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + # you need to set these two to true to continue the training + resume_if_exists: false + resume_ignore_no_checkpoint: false + + # You may use this section to create a W&B logger + create_wandb_logger: false + wandb_logger_kwargs: + name: test + project: gense diff --git a/examples/audio/conf/flow_matching_generative_finetuning.yaml b/examples/audio/conf/flow_matching_generative_finetuning.yaml new file mode 100644 index 000000000000..c7ba19aee466 --- /dev/null +++ b/examples/audio/conf/flow_matching_generative_finetuning.yaml @@ -0,0 +1,167 @@ +name: flow_matching_generative_finetuning + +init_from_nemo_model: null +init_strict: false + +model: + type: flow_matching + sample_rate: 16000 + skip_nan_grad: false + num_outputs: 1 + p_cond: 0.9 # Proability of feeding the conditional input into the model. + normalize_input: true # normalize the input signal to 0dBFS + max_utts_evaluation_metrics: 500 + + train_ds: + manifest_filepath: ??? + input_key: noisy_filepath + target_key: clean_filepath + audio_duration: 6.14 # Number of STFT time frames = 1 + audio_duration // encoder.hop_length = 768 + random_offset: true + batch_size: 8 # batch size may be increased based on the available memory + shuffle: true + num_workers: 8 + pin_memory: true + + validation_ds: + manifest_filepath: ??? 
+ input_key: noisy_filepath + target_key: clean_filepath + batch_size: 8 + shuffle: false + num_workers: 4 + pin_memory: true + + log_config: + log_tensorboard: true + log_wandb: false + max_utts: 8 + + encoder: + _target_: nemo.collections.audio.modules.transforms.AudioToSpectrogram + fft_length: 510 # Number of subbands in the STFT = fft_length // 2 + 1 = 256 + hop_length: 128 + magnitude_power: 0.5 + scale: 0.33 + + decoder: + _target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio + fft_length: ${model.encoder.fft_length} + hop_length: ${model.encoder.hop_length} + magnitude_power: ${model.encoder.magnitude_power} + scale: ${model.encoder.scale} + + estimator: + _target_: nemo.collections.audio.parts.submodules.transformerunet.SpectrogramTransformerUNet + in_channels: 2 # concatenation of single-channel perturbed and noisy + out_channels: 1 # single-channel score estimate + depth: 24 + ff_dropout: 0.1 + time_hidden_dim: 1024 + + flow: + _target_: nemo.collections.audio.parts.submodules.flow.OptimalTransportFlow + sigma_start: 1.0 + sigma_end: 1e-4 + + sampler: + _target_: nemo.collections.audio.parts.submodules.flow.ConditionalFlowMatchingEulerSampler + num_steps: 20 + time_min: 1e-8 + time_max: 1.0 + + loss: + _target_: nemo.collections.audio.losses.MSELoss + ndim: 4 # loss is calculated on the score in the encoded domain (batch, channel, dimension, time) + + metrics: + val: + sisdr: # output SI-SDR + _target_: torchmetrics.audio.ScaleInvariantSignalDistortionRatio + estoi: # output ESTOI + _target_: torchmetrics.audio.ShortTimeObjectiveIntelligibility + fs: ${model.sample_rate} + extended: true + pesq: # output PESQ + _target_: torchmetrics.audio.PerceptualEvaluationSpeechQuality + fs: ${model.sample_rate} + mode: wb + + optim: + name: adam + lr: 1e-4 + # optimizer arguments + betas: [0.9, 0.999] + weight_decay: 0.0 + + # scheduler setup + sched: + name: CosineAnnealing + # scheduler config override + warmup_steps: 5000 + warmup_ratio: null + min_lr: 0 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: -1 + max_steps: -1 # computed at runtime if not set + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations + accelerator: auto + strategy: ddp + accumulate_grad_batches: 1 + gradient_clip_val: 0.2 + precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + log_every_n_steps: 25 # Interval of logging. + enable_progress_bar: true + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: false # Provided by exp_manager + logger: false # Provided by exp_manager + +exp_manager: + exp_dir: null + name: ${name} + + # use exponential moving average for model parameters + ema: + enable: true + decay: 0.999 # decay rate + cpu_offload: false # offload EMA parameters to CPU to save GPU memory + every_n_steps: 1 # how often to update EMA weights + validate_original_weights: false # use original weights for validation calculation? 
+ + # logging + create_tensorboard_logger: true + + # checkpointing + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: val_pesq + mode: max + save_top_k: 3 + always_save_nemo: true # saves the checkpoints as nemo files instead of PTL checkpoints + + # early stopping + create_early_stopping_callback: true + early_stopping_callback_params: + monitor: val_sisdr + mode: max + min_delta: 0.0 + patience: 20 # patience in terms of check_val_every_n_epoch + verbose: true + strict: false # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + # you need to set these two to true to continue the training + resume_if_exists: false + resume_ignore_no_checkpoint: false + + # You may use this section to create a W&B logger + create_wandb_logger: false + wandb_logger_kwargs: + name: test + project: gense diff --git a/examples/audio/conf/flow_matching_generative_ssl_pretraining.yaml b/examples/audio/conf/flow_matching_generative_ssl_pretraining.yaml new file mode 100644 index 000000000000..7813a9473644 --- /dev/null +++ b/examples/audio/conf/flow_matching_generative_ssl_pretraining.yaml @@ -0,0 +1,171 @@ +name: flow_matching_generative_ssl_pretraining + +model: + type: flow_matching + sample_rate: 16000 + skip_nan_grad: true + num_outputs: 1 + p_cond: 0.9 # Proability of feeding the conditional input into the model. + normalize_input: true # normalize the input signal to 0dBFS + max_utts_evaluation_metrics: 125 + + train_ds: + shar_path: ??? + use_lhotse: true + truncate_duration: 4.09 # Number of STFT time frames = 1 + audio_duration // encoder.hop_length = 512 + truncate_offset_type: random + batch_size: 8 # batch size may be increased based on the available memory + shuffle: true + num_workers: 8 + pin_memory: true + + validation_ds: + manifest_filepath: ??? 
+ input_key: clean_filepath + target_key: clean_filepath + random_offset: false + batch_size: 8 + shuffle: false + num_workers: 4 + pin_memory: true + + log_config: + log_tensorboard: true + log_wandb: false + max_utts: 8 + + encoder: + _target_: nemo.collections.audio.modules.transforms.AudioToSpectrogram + fft_length: 510 # Number of subbands in the STFT = fft_length // 2 + 1 = 256 + hop_length: 128 + magnitude_power: 0.5 + scale: 0.33 + + decoder: + _target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio + fft_length: ${model.encoder.fft_length} + hop_length: ${model.encoder.hop_length} + magnitude_power: ${model.encoder.magnitude_power} + scale: ${model.encoder.scale} + + estimator: + _target_: nemo.collections.audio.parts.submodules.transformerunet.SpectrogramTransformerUNet + in_channels: 2 # concatenation of single-channel perturbed and noisy + out_channels: 1 # single-channel score estimate + depth: 24 + ff_dropout: 0.1 + time_hidden_dim: 1024 + + flow: + _target_: nemo.collections.audio.parts.submodules.flow.OptimalTransportFlow + sigma_start: 1.0 + sigma_end: 1e-4 + + sampler: + _target_: nemo.collections.audio.parts.submodules.flow.ConditionalFlowMatchingEulerSampler + num_steps: 20 + time_min: 1e-8 + time_max: 1.0 + + ssl_pretrain_masking: + _target_: nemo.collections.audio.modules.ssl_pretrain_masking.SSLPretrainWithMaskedPatch + patch_size: 10 + mask_fraction: 0.7 + + loss: + _target_: nemo.collections.audio.losses.MSELoss + ndim: 4 # loss is calculated on the score in the encoded domain (batch, channel, dimension, time) + + metrics: + val: + sisdr: # output SI-SDR + _target_: torchmetrics.audio.ScaleInvariantSignalDistortionRatio + estoi: # output ESTOI + _target_: torchmetrics.audio.ShortTimeObjectiveIntelligibility + fs: ${model.sample_rate} + extended: true + pesq: # output PESQ + _target_: torchmetrics.audio.PerceptualEvaluationSpeechQuality + fs: ${model.sample_rate} + mode: wb + + optim: + name: adam + lr: 5e-5 + # optimizer arguments + betas: [0.9, 0.999] + weight_decay: 0.0 + + # scheduler setup + sched: + name: CosineAnnealing + # scheduler config override + warmup_steps: 5000 + warmup_ratio: null + min_lr: 1e-5 + +trainer: + devices: -1 # number of GPUs, -1 would use all available GPUs + num_nodes: 1 + max_epochs: -1 + max_steps: 10000 # needs to be set for shar datasets + limit_train_batches: 1000 # number of batches to train on in each pseudo-epoch + val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations + accelerator: auto + strategy: ddp + use_distributed_sampler: false # required for lhotse + accumulate_grad_batches: 1 + gradient_clip_val: 0.2 + precision: 32 # Should be set to 16 for O1 and O2 to enable the AMP. + log_every_n_steps: 25 # Interval of logging. 
+ enable_progress_bar: true + num_sanity_val_steps: 0 # number of steps to perform validation steps for sanity check the validation process before starting the training, setting to 0 disables it + check_val_every_n_epoch: 1 # number of evaluations on validation every n epochs + sync_batchnorm: true + enable_checkpointing: false # Provided by exp_manager + logger: false # Provided by exp_manager + +exp_manager: + exp_dir: null + name: ${name} + + # use exponential moving average for model parameters + ema: + enable: true + decay: 0.999 # decay rate + cpu_offload: false # offload EMA parameters to CPU to save GPU memory + every_n_steps: 1 # how often to update EMA weights + validate_original_weights: false # use original weights for validation calculation? + + # logging + create_tensorboard_logger: true + + # checkpointing + create_checkpoint_callback: true + checkpoint_callback_params: + # in case of multiple validation sets, first one is used + monitor: val_pesq + mode: max + save_top_k: 3 + always_save_nemo: true # saves the checkpoints as nemo files instead of PTL checkpoints + + # early stopping + create_early_stopping_callback: true + early_stopping_callback_params: + monitor: val_sisdr + mode: max + min_delta: 0.0 + patience: 20 # patience in terms of check_val_every_n_epoch + verbose: true + strict: false # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + # you need to set these two to true to continue the training + resume_if_exists: false + resume_ignore_no_checkpoint: false + + # You may use this section to create a W&B logger + create_wandb_logger: false + wandb_logger_kwargs: + name: null + project: null diff --git a/nemo/collections/audio/data/audio_to_audio_lhotse.py b/nemo/collections/audio/data/audio_to_audio_lhotse.py index 27d8a0ed28d7..d8978c19d692 100644 --- a/nemo/collections/audio/data/audio_to_audio_lhotse.py +++ b/nemo/collections/audio/data/audio_to_audio_lhotse.py @@ -44,19 +44,29 @@ class LhotseAudioToTargetDataset(torch.utils.data.Dataset): EMBEDDING_KEY = "embedding_vector" def __getitem__(self, cuts: CutSet) -> dict[str, torch.Tensor]: - src_audio, src_audio_lens = collate_audio(cuts) + # In the rare case, the collate_audio function would raise the FileSeek error when loading .flac (https://github.com/bastibe/python-soundfile/issues/274) + # A workaround is to use fault_tolerant and skip failed data, resulting in a smaller batch size for the few problematic cases. 
+ src_audio, src_audio_lens, retained_padded_cuts = collate_audio(cuts, fault_tolerant=True) ans = { "input_signal": src_audio, "input_length": src_audio_lens, } - if _key_available(cuts, self.TARGET_KEY): - tgt_audio, tgt_audio_lens = collate_audio(cuts, recording_field=self.TARGET_KEY) + # keep only the first non-padding cuts + retained_cuts = [ + cut._first_non_padding_cut if isinstance(cut, MixedCut) else cut for cut in retained_padded_cuts + ] + retained_cuts = CutSet.from_cuts(retained_cuts) + + if _key_available(retained_cuts, self.TARGET_KEY): + # TODO: use fault_tolerant=True for robust loading of target + tgt_audio, tgt_audio_lens = collate_audio(retained_cuts, recording_field=self.TARGET_KEY) ans.update(target_signal=tgt_audio, target_length=tgt_audio_lens) - if _key_available(cuts, self.REFERENCE_KEY): - ref_audio, ref_audio_lens = collate_audio(cuts, recording_field=self.REFERENCE_KEY) + if _key_available(retained_cuts, self.REFERENCE_KEY): + # TODO: use fault_tolerant=True for robust loading of target + ref_audio, ref_audio_lens = collate_audio(retained_cuts, recording_field=self.REFERENCE_KEY) ans.update(reference_signal=ref_audio, reference_length=ref_audio_lens) if _key_available(cuts, self.EMBEDDING_KEY): - emb = collate_custom_field(cuts, field=self.EMBEDDING_KEY) + emb = collate_custom_field(retained_cuts, field=self.EMBEDDING_KEY) ans.update(embedding_signal=emb) return ans diff --git a/nemo/collections/audio/models/audio_to_audio.py b/nemo/collections/audio/models/audio_to_audio.py index ef9ce648f1a2..e1732c1658b7 100644 --- a/nemo/collections/audio/models/audio_to_audio.py +++ b/nemo/collections/audio/models/audio_to_audio.py @@ -483,4 +483,35 @@ def on_after_backward(self): if valid_gradients < 1: logging.warning('detected inf or nan values in gradients! Setting gradients to zero.') - self.zero_grad() + self.zero_grad(set_to_none=False) + + def configure_callbacks(self): + """ + Create an callback to add audio/spectrogram into tensorboard & wandb. + """ + self.log_config = self.cfg.get("log_config", None) + if not self.log_config: + return [] + + log_callbacks = [] + from nemo.collections.audio.parts.utils.callbacks import SpeechEnhancementLoggingCallback + + if isinstance(self._validation_dl, List): + data_loaders = self._validation_dl + else: + data_loaders = [self._validation_dl] + + for data_loader_idx, data_loader in enumerate(data_loaders): + log_callbacks.append( + SpeechEnhancementLoggingCallback( + data_loader=data_loader, + data_loader_idx=data_loader_idx, + loggers=self.trainer.loggers, + log_tensorboard=self.log_config.log_tensorboard, + log_wandb=self.log_config.log_wandb, + sample_rate=self.sample_rate, + max_utts=self.log_config.get("max_utts", None), + ) + ) + + return log_callbacks diff --git a/nemo/collections/audio/models/enhancement.py b/nemo/collections/audio/models/enhancement.py index e7fbc9023117..cd9f47b98096 100644 --- a/nemo/collections/audio/models/enhancement.py +++ b/nemo/collections/audio/models/enhancement.py @@ -30,6 +30,7 @@ 'ScoreBasedGenerativeAudioToAudioModel', 'PredictiveAudioToAudioModel', 'SchroedingerBridgeAudioToAudioModel', + 'FlowMatchingAudioToAudioModel', ] @@ -618,6 +619,274 @@ def evaluation_step(self, batch, batch_idx, dataloader_idx: int = 0, tag: str = return {f'{tag}_loss': loss} +class FlowMatchingAudioToAudioModel(AudioToAudioModel): + """This models uses a flow matching process to generate + an encoded representation of the enhanced signal. 
+
+    The model consists of the following blocks:
+        - encoder: transforms input multi-channel audio signal into an encoded representation (analysis transform)
+        - estimator: neural model, estimates the conditional vector field for the flow matching process
+        - flow: ordinary differential equation (ODE) defining a flow and a vector field.
+        - sampler: sampler for the inference process, estimates coefficients of the target signal
+        - decoder: transforms sampler output into the time domain (synthesis transform)
+        - ssl_pretrain_masking: if defined, applies SSL-pretraining masking for self-reconstruction during training
+    """
+
+    def __init__(self, cfg: DictConfig, trainer: Trainer = None):
+        super().__init__(cfg=cfg, trainer=trainer)
+        self.sample_rate = self._cfg.sample_rate
+
+        # Setup processing modules
+        self.encoder = self.from_config_dict(self._cfg.encoder)
+        self.decoder = self.from_config_dict(self._cfg.decoder)
+
+        # Neural estimator
+        self.estimator = self.from_config_dict(self._cfg.estimator)
+
+        # Flow
+        self.flow = self.from_config_dict(self._cfg.flow)
+
+        # Sampler
+        self.sampler = hydra.utils.instantiate(self._cfg.sampler, estimator=self.estimator)
+
+        # probability that the conditional input will be fed into the
+        # estimator in the training stage
+        self.p_cond = self._cfg.get('p_cond', 1.0)
+
+        # Self-Supervised Pretraining
+        if self._cfg.get('ssl_pretrain_masking') is not None:
+            logging.debug('SSL-pretrain_masking is found and will be initialized')
+            self.ssl_pretrain_masking = self.from_config_dict(self._cfg.ssl_pretrain_masking)
+        else:
+            self.ssl_pretrain_masking = None
+
+        # Normalization
+        self.normalize_input = self._cfg.get('normalize_input', False)
+
+        # Metric evaluation
+        self.max_utts_evaluation_metrics = self._cfg.get('max_utts_evaluation_metrics')
+
+        if self.max_utts_evaluation_metrics is not None:
+            logging.warning(
+                'Metrics will be evaluated on first %d examples of the evaluation datasets.',
+                self.max_utts_evaluation_metrics,
+            )
+
+        # Regularization
+        self.eps = self._cfg.get('eps', 1e-8)
+
+        # Setup optional Optimization flags
+        self.setup_optimization_flags()
+
+        logging.debug('Initialized %s', self.__class__.__name__)
+        logging.debug('\tdoing SSL-pretraining: %s', (self.ssl_pretrain_masking is not None))
+        logging.debug('\tp_cond: %s', self.p_cond)
+        logging.debug('\tnormalize_input: %s', self.normalize_input)
+        logging.debug('\tloss: %s', self.loss)
+        logging.debug('\teps: %s', self.eps)
+
+    @property
+    def input_types(self) -> Dict[str, NeuralType]:
+        return {
+            "input_signal": NeuralType(('B', 'C', 'T'), AudioSignal(freq=self.sample_rate)),
+            "input_length": NeuralType(tuple('B'), LengthsType(), optional=True),
+        }
+
+    @property
+    def output_types(self) -> Dict[str, NeuralType]:
+        return {
+            "output_signal": NeuralType(('B', 'C', 'T'), AudioSignal(freq=self.sample_rate)),
+            "output_length": NeuralType(tuple('B'), LengthsType(), optional=True),
+        }
+
+    @typecheck()
+    @torch.inference_mode()
+    def forward(self, input_signal, input_length=None):
+        """Forward pass of the model to generate samples from the target distribution.
+
+        Args:
+            input_signal: Tensor that represents a batch of raw audio signals,
+                of shape [B, T] or [B, T, C]. T here represents timesteps, with 1 second of audio represented as
+                `self.sample_rate` number of floating point values.
+            input_length: Vector of length B, that contains the individual lengths of the audio
+                sequences.
+ + Returns: + Output signal `output` in the time domain and the length of the output signal `output_length`. + """ + batch_length = input_signal.size(-1) + + if self.normalize_input: + # max for each example in the batch + norm_scale = torch.amax(input_signal.abs(), dim=(-1, -2), keepdim=True) + # scale input signal + input_signal = input_signal / (norm_scale + self.eps) + + # Encoder + encoded, encoded_length = self.encoder(input=input_signal, input_length=input_length) + + if self.p_cond == 0: + encoded = torch.zeros_like(encoded) + elif self.ssl_pretrain_masking is not None: + encoded = self.ssl_pretrain_masking(input_spec=encoded, length=encoded_length) + + init_state = torch.randn_like(encoded) * self.flow.sigma_start + + # Sampler + generated, generated_length = self.sampler( + state=init_state, estimator_condition=encoded, state_length=encoded_length + ) + + # Decoder + output, output_length = self.decoder(input=generated, input_length=generated_length) + + if self.normalize_input: + # rescale to the original scale + output = output * norm_scale + + # Trim or pad the estimated signal to match input length + output = self.match_batch_length(input=output, batch_length=batch_length) + + return output, output_length + + @typecheck( + input_types={ + "target_signal": NeuralType(('B', 'C', 'T'), AudioSignal()), + "input_signal": NeuralType(('B', 'C', 'T'), AudioSignal()), + "input_length": NeuralType(tuple('B'), LengthsType()), + }, + output_types={ + "loss": NeuralType(None, LossType()), + }, + ) + def _step(self, target_signal, input_signal, input_length=None): + batch_size = target_signal.size(0) + + if self.normalize_input: + # max for each example in the batch + norm_scale = torch.amax(input_signal.abs(), dim=(-1, -2), keepdim=True) + # scale input signal + input_signal = input_signal / (norm_scale + self.eps) + # scale the target signal + target_signal = target_signal / (norm_scale + self.eps) + + # Apply encoder to both target and the input + input_enc, input_enc_len = self.encoder(input=input_signal, input_length=input_length) + target_enc, _ = self.encoder(input=target_signal, input_length=input_length) + + # Self-Supervised Pretraining + if self.ssl_pretrain_masking is not None: + input_enc = self.ssl_pretrain_masking(input_spec=input_enc, length=input_enc_len) + + # Drop off conditional inputs (input_enc) with (1 - p_cond) probability. 
+ # The dropped conditions will be set to zeros + keep_conditions = einops.rearrange((torch.rand(batch_size) < self.p_cond).float(), 'B -> B 1 1 1') + input_enc = input_enc * keep_conditions.to(input_enc.device) + + x_start = torch.zeros_like(input_enc) + + time = self.flow.generate_time(batch_size=batch_size).to(device=input_enc.device) + sample = self.flow.sample(time=time, x_start=x_start, x_end=target_enc) + + # we want to get a vector field estimate given current state + # at training time, current state is sampled from the conditional path + # the vector field model is also conditioned on input signal + estimator_input = torch.cat([sample, input_enc], dim=-3) + + # Estimate the vector using the neural estimator + estimate, estimate_len = self.estimator(input=estimator_input, input_length=input_enc_len, condition=time) + + conditional_vector_field = self.flow.vector_field(time=time, x_start=x_start, x_end=target_enc, point=sample) + + return self.loss(estimate=estimate, target=conditional_vector_field, input_length=input_enc_len) + + # PTL-specific methods + def training_step(self, batch, batch_idx): + if isinstance(batch, dict): + # lhotse batches are dictionaries + input_signal = batch['input_signal'] + input_length = batch['input_length'] + target_signal = batch.get('target_signal', input_signal.clone()) + else: + input_signal, input_length, target_signal, _ = batch + + # For consistency, the model uses multi-channel format, even if the channel dimension is 1 + if input_signal.ndim == 2: + input_signal = einops.rearrange(input_signal, "B T -> B 1 T") + if target_signal.ndim == 2: + target_signal = einops.rearrange(target_signal, "B T -> B 1 T") + + # Calculate the loss + loss = self._step(target_signal=target_signal, input_signal=input_signal, input_length=input_length) + + # Logs + self.log('train_loss', loss) + self.log('learning_rate', self._optimizer.param_groups[0]['lr']) + self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32)) + + return loss + + def evaluation_step(self, batch, batch_idx, dataloader_idx: int = 0, tag: str = 'val'): + + if isinstance(batch, dict): + # lhotse batches are dictionaries + input_signal = batch['input_signal'] + input_length = batch['input_length'] + target_signal = batch.get('target_signal', input_signal.clone()) + else: + input_signal, input_length, target_signal, _ = batch + + # For consistency, the model uses multi-channel format, even if the channel dimension is 1 + if input_signal.ndim == 2: + input_signal = einops.rearrange(input_signal, 'B T -> B 1 T') + if target_signal.ndim == 2: + target_signal = einops.rearrange(target_signal, 'B T -> B 1 T') + + # Calculate loss + loss = self._step( + target_signal=target_signal, + input_signal=input_signal, + input_length=input_length, + ) + + # Update metrics + update_metrics = False + if self.max_utts_evaluation_metrics is None: + # Always update if max is not configured + update_metrics = True + # Number of examples to process + num_examples = input_signal.size(0) # batch size + else: + # Check how many examples have been used for metric calculation + first_metric_name = next(iter(self.metrics[tag][dataloader_idx])) + num_examples_evaluated = self.metrics[tag][dataloader_idx][first_metric_name].num_examples + # Update metrics if some examples were not processed + update_metrics = num_examples_evaluated < self.max_utts_evaluation_metrics + # Number of examples to process + num_examples = min(self.max_utts_evaluation_metrics - num_examples_evaluated, 
input_signal.size(0))
+
+        if update_metrics:
+            # Generate output signal
+            output_signal, _ = self.forward(
+                input_signal=input_signal[:num_examples, ...], input_length=input_length[:num_examples]
+            )
+
+            # Update metrics
+            if hasattr(self, 'metrics') and tag in self.metrics:
+                # Update metrics for this (tag, dataloader_idx)
+                for name, metric in self.metrics[tag][dataloader_idx].items():
+                    metric.update(
+                        preds=output_signal,
+                        target=target_signal[:num_examples, ...],
+                        input_length=input_length[:num_examples],
+                    )
+
+        # Log global step
+        self.log('global_step', torch.tensor(self.trainer.global_step, dtype=torch.float32))
+
+        return {f'{tag}_loss': loss}
+
+
 class SchroedingerBridgeAudioToAudioModel(AudioToAudioModel):
     """This models is using a Schrödinger Bridge process to generate
     an encoded representation of the enhanced signal.
diff --git a/nemo/collections/audio/modules/ssl_pretrain_masking.py b/nemo/collections/audio/modules/ssl_pretrain_masking.py
new file mode 100644
index 000000000000..ba0722f180d8
--- /dev/null
+++ b/nemo/collections/audio/modules/ssl_pretrain_masking.py
@@ -0,0 +1,106 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+
+import einops
+import torch
+
+from nemo.core.classes import NeuralModule, typecheck
+from nemo.core.neural_types import LengthsType, NeuralType, SpectrogramType
+
+__all__ = ['SSLPretrainWithMaskedPatch']
+
+
+class SSLPretrainWithMaskedPatch(NeuralModule):
+    """
+    Zeroes out fixed size time patches of the spectrogram.
+    All samples in batch are guaranteed to have the same amount of masked time steps.
+    Note that this may be problematic when we do pretraining on an unbalanced dataset.
+
+    For example, say a batch contains two spectrograms of length 87 and 276.
+    With mask_fraction=0.7 and patch_size=10, we'll obtain mask_patches=6.
+    Each of the two examples will then have 6 masked patches of 10 frames.
+
+    Args:
+        patch_size (int): up to how many time steps one patch consists of.
+            Defaults to 10.
+        mask_fraction (float): fraction of each sample to be masked (number of patches is rounded up).
+            Range from 0.0 to 1.0. Defaults to 0.7.
+    """
+
+    @property
+    def input_types(self):
+        """Returns definitions of module input types"""
+        return {
+            "input_spec": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()),
+            "length": NeuralType(tuple('B'), LengthsType()),
+        }
+
+    @property
+    def output_types(self):
+        """Returns definitions of module output types"""
+        return {"augmented_spec": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType())}
+
+    def __init__(
+        self,
+        patch_size: int = 10,
+        mask_fraction: float = 0.7,
+    ):
+        super().__init__()
+        self.patch_size = patch_size
+        if mask_fraction > 1.0 or mask_fraction < 0.0:
+            raise ValueError('mask_fraction must be within [0.0, 1.0]')
+        else:
+            self.mask_fraction = mask_fraction
+
+    @typecheck()
+    def forward(self, input_spec, length):
+        """
+        Apply patched masking on the input_spec.
+ + + During the training stage, the mask is generated randomly, with + approximately `self.mask_fraction` of the time frames being masked out. + + In the validation stage, the masking pattern is fixed to ensure + consistent evaluation of checkpoints and to prevent overfitting. Note + that the same masking pattern is applied to all data, regardless of + their lengths. On average, approximately `self.mask_fraction` of the + time frames will be masked out. + + """ + augmented_spec = input_spec + + min_len = torch.min(length) + if self.training: + len_fraction = int(min_len * self.mask_fraction) + mask_patches = len_fraction // self.patch_size + int(len_fraction % self.patch_size != 0) + + if min_len < self.patch_size * mask_patches: + mask_patches = min_len // self.patch_size + + for idx, cur_len in enumerate(length.tolist()): + patches = range(cur_len // self.patch_size) + masked_patches = random.sample(patches, mask_patches) + for mp in masked_patches: + augmented_spec[idx, :, :, mp * self.patch_size : (mp + 1) * self.patch_size] = 0.0 + else: + chunk_length = self.patch_size // self.mask_fraction + mask = torch.arange(augmented_spec.size(-1), device=augmented_spec.device) + mask = (mask % chunk_length) >= self.patch_size + mask = einops.rearrange(mask, 'T -> 1 1 1 T').float() + augmented_spec = augmented_spec * mask + + return augmented_spec diff --git a/nemo/collections/audio/parts/submodules/flow.py b/nemo/collections/audio/parts/submodules/flow.py new file mode 100644 index 000000000000..748d4c6c6d3b --- /dev/null +++ b/nemo/collections/audio/parts/submodules/flow.py @@ -0,0 +1,252 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
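For reference, a minimal usage sketch of the SSLPretrainWithMaskedPatch module added in the hunk above. This is an illustrative example, not part of the patch; the batch shapes, lengths, and values are assumptions chosen only to show the call pattern.

import torch

from nemo.collections.audio.modules.ssl_pretrain_masking import SSLPretrainWithMaskedPatch

# (B, C, D, T) spectrogram batch: 2 examples, 1 channel, 256 subbands, 512 frames
spec = torch.randn(2, 1, 256, 512)
lengths = torch.tensor([512, 300])

masking = SSLPretrainWithMaskedPatch(patch_size=10, mask_fraction=0.7)

masking.train()  # training mode: random patches are zeroed independently per example
masked_train = masking(input_spec=spec, length=lengths)

masking.eval()   # eval mode: a fixed periodic pattern is applied for consistent validation
masked_eval = masking(input_spec=spec, length=lengths)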
+from abc import ABC, abstractmethod +from typing import Tuple + +import einops +import torch + +from nemo.collections.common.parts.utils import mask_sequence_tensor +from nemo.utils import logging + + +class ConditionalFlow(ABC): + """ + Abstract class for different conditional flow-matching (CFM) classes + + Time horizon is [time_min, time_max (should be 1)] + + every path is "conditioned" on endpoints of the path + endpoints are just our paired data samples + subclasses need to implement mean, std, and vector_field + + """ + + def __init__(self, time_min: float = 1e-8, time_max: float = 1.0): + self.time_min = time_min + self.time_max = time_max + + @abstractmethod + def mean(self, *, time: torch.Tensor, x_start: torch.Tensor, x_end: torch.Tensor) -> torch.Tensor: + """ + Return the mean of p_t(x | x_start, x_end) at time t + """ + pass + + @abstractmethod + def std(self, *, time: torch.Tensor, x_start: torch.Tensor, x_end: torch.Tensor) -> torch.Tensor: + """ + Return the standard deviation of p_t(x | x_start, x_end) at time t + """ + pass + + @abstractmethod + def vector_field( + self, *, time: torch.Tensor, x_start: torch.Tensor, x_end: torch.Tensor, point: torch.Tensor + ) -> torch.Tensor: + """ + Compute the conditional vector field v_t( point | x_start, x_end) + """ + pass + + @staticmethod + def _broadcast_time(time: torch.Tensor, n_dim: int) -> torch.Tensor: + """ + Broadcast time tensor to the desired number of dimensions + """ + if time.ndim == 1: + target_shape = ' '.join(['B'] + ['1'] * (n_dim - 1)) + time = einops.rearrange(time, f'B -> {target_shape}') + + return time + + def generate_time(self, batch_size: int) -> torch.Tensor: + """ + Randomly sample a batchsize of time_steps from U[0~1] + """ + return torch.clamp(torch.rand((batch_size,)), self.time_min, self.time_max) + + def sample(self, *, time: torch.Tensor, x_start: torch.Tensor, x_end: torch.Tensor) -> torch.Tensor: + """ + Generate a sample from p_t(x | x_start, x_end) at time t. + Note that this implementation assumes all path marginals are normally distributed. + """ + time = self._broadcast_time(time, n_dim=x_start.ndim) + + mean = self.mean(time=time, x_start=x_start, x_end=x_end) + std = self.std(time=time, x_start=x_start, x_end=x_end) + return mean + std * torch.randn_like(mean) + + def flow( + self, *, time: torch.Tensor, x_start: torch.Tensor, x_end: torch.Tensor, point: torch.Tensor + ) -> torch.Tensor: + """ + Compute the conditional flow phi_t( point | x_start, x_end). + This is an affine flow. 
+        """
+        mean = self.mean(time=time, x_start=x_start, x_end=x_end)
+        std = self.std(time=time, x_start=x_start, x_end=x_end)
+        return mean + std * (point - x_start)
+
+
+class OptimalTransportFlow(ConditionalFlow):
+    """The OT-CFM model from [Lipman et al., 2023]
+
+    For every conditional path, the following holds:
+    p_0 = N(x_start, sigma_start)
+    p_1 = N(x_end, sigma_end),
+
+    mean(x, t) = (time_max - t) * x_start + t * x_end
+        (linear interpolation between x_start and x_end)
+
+    std(x, t) = (time_max - t) * sigma_start + t * sigma_end
+
+    Every conditional path is an optimal transport map from p_0(x_start, x_end) to p_1(x_start, x_end)
+    The marginal path is not guaranteed to be an optimal transport map from p_0 to p_1
+
+    To get the OT-CFM model from [Lipman et al., 2023], just pass zeroes for x_start
+    To get the I-CFM model, set sigma_start=sigma_end
+    To get the rectified flow model, set sigma_start=sigma_end=0
+
+    Args:
+        time_min: minimum time value used in the process
+        time_max: maximum time value used in the process
+        sigma_start: the standard deviation of the initial distribution
+        sigma_end: the standard deviation of the target distribution
+    """
+
+    def __init__(
+        self, time_min: float = 1e-8, time_max: float = 1.0, sigma_start: float = 1.0, sigma_end: float = 1e-4
+    ):
+        super().__init__(time_min=time_min, time_max=time_max)
+        self.sigma_start = sigma_start
+        self.sigma_end = sigma_end
+
+        logging.debug('Initialized %s with', self.__class__.__name__)
+        logging.debug('\ttime_min: %s', self.time_min)
+        logging.debug('\ttime_max: %s', self.time_max)
+        logging.debug('\tsigma_start: %s', self.sigma_start)
+        logging.debug('\tsigma_end: %s', self.sigma_end)
+
+    def mean(self, *, x_start: torch.Tensor, x_end: torch.Tensor, time: torch.Tensor) -> torch.Tensor:
+        return (self.time_max - time) * x_start + time * x_end
+
+    def std(self, *, x_start: torch.Tensor, x_end: torch.Tensor, time: torch.Tensor) -> torch.Tensor:
+        return (self.time_max - time) * self.sigma_start + time * self.sigma_end
+
+    def vector_field(
+        self,
+        *,
+        x_start: torch.Tensor,
+        x_end: torch.Tensor,
+        time: torch.Tensor,
+        point: torch.Tensor,
+        eps: float = 1e-6,
+    ) -> torch.Tensor:
+        time = self._broadcast_time(time, n_dim=x_start.ndim)
+
+        if self.sigma_start == self.sigma_end:
+            return x_end - x_start
+
+        num = self.sigma_end * (point - x_start) - self.sigma_start * (point - x_end)
+        denom = (1 - time) * self.sigma_start + time * self.sigma_end
+        return num / (denom + eps)
+
+
+class ConditionalFlowMatchingSampler(ABC):
+    """
+    Abstract class for different samplers to solve the ODE in CFM
+
+    Args:
+        estimator: the NN-based conditional vector field estimator
+        num_steps: How many time steps to iterate in the process
+        time_min: minimum time value used in the process
+        time_max: maximum time value used in the process
+
+    """
+
+    def __init__(
+        self,
+        estimator: torch.nn.Module,
+        num_steps: int = 5,
+        time_min: float = 1e-8,
+        time_max: float = 1.0,
+    ):
+        self.estimator = estimator
+        self.num_steps = num_steps
+        self.time_min = time_min
+        self.time_max = time_max
+
+    @property
+    def time_step(self):
+        return (self.time_max - self.time_min) / self.num_steps
+
+    @abstractmethod
+    def forward(
+        self, state: torch.Tensor, estimator_condition: torch.Tensor, state_length: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        pass
+
+
+class ConditionalFlowMatchingEulerSampler(ConditionalFlowMatchingSampler):
+    """
+    The Euler Sampler for solving the ODE in CFM on a uniform time grid
+    """
+
+    def __init__(
+        self,
+
estimator: torch.nn.Module, + num_steps: int = 5, + time_min: float = 1e-8, + time_max: float = 1.0, + ): + super().__init__( + estimator=estimator, + num_steps=num_steps, + time_min=time_min, + time_max=time_max, + ) + logging.debug('Initialized %s with', self.__class__.__name__) + logging.debug('\tnum_steps: %s', self.num_steps) + logging.debug('\ttime_min: %s', self.time_min) + logging.debug('\ttime_max: %s', self.time_max) + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) + + @torch.inference_mode() + def forward( + self, state: torch.Tensor, estimator_condition: torch.Tensor, state_length: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + time_steps = torch.linspace(self.time_min, self.time_max, self.num_steps + 1) + + if state_length is not None: + state = mask_sequence_tensor(state, state_length) + + for t in time_steps: + time = t * torch.ones(state.shape[0], device=state.device) + + if estimator_condition is None: + estimator_input = state + else: + estimator_input = torch.cat([state, estimator_condition], dim=1) + + vector_field, _ = self.estimator(input=estimator_input, input_length=state_length, condition=time) + + state = state + vector_field * self.time_step + + if state_length is not None: + state = mask_sequence_tensor(state, state_length) + + return state, state_length diff --git a/nemo/collections/audio/parts/submodules/transformerunet.py b/nemo/collections/audio/parts/submodules/transformerunet.py new file mode 100644 index 000000000000..b7c14d513bab --- /dev/null +++ b/nemo/collections/audio/parts/submodules/transformerunet.py @@ -0,0 +1,507 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# MIT License +# +# Copyright (c) 2023 Phil Wang +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
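For reference, a short sketch of how the conditional path and vector field from OptimalTransportFlow (added in flow.py above) fit together with a single Euler update of the sampler. This is an illustrative example, not part of the patch; the tensor shapes and the number of steps are assumptions.

import torch

from nemo.collections.audio.parts.submodules.flow import OptimalTransportFlow

# (B, C, D, T) encoded representations; OT-CFM starts the path from zeros
x_start = torch.zeros(2, 1, 4, 8)
x_end = torch.randn(2, 1, 4, 8)  # encoded target

flow = OptimalTransportFlow(sigma_start=1.0, sigma_end=1e-4)
time = flow.generate_time(batch_size=2)
point = flow.sample(time=time, x_start=x_start, x_end=x_end)

# Conditional vector field the neural estimator is trained to regress
target_v = flow.vector_field(time=time, x_start=x_start, x_end=x_end, point=point)

# One explicit Euler step of dx/dt = v(x, t), matching the update rule in
# ConditionalFlowMatchingEulerSampler with num_steps=20
dt = (flow.time_max - flow.time_min) / 20
point_next = point + target_v * dt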
+ +import math +from functools import partial +from typing import Dict, Optional + +import einops +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn import Module + +from nemo.core.classes import NeuralModule, typecheck +from nemo.core.neural_types import BoolType, FloatType, LengthsType, NeuralType, SpectrogramType +from nemo.utils import logging + +__all__ = ['TransformerUNet'] + + +class LearnedSinusoidalPosEmb(Module): + """The sinusoidal Embedding to encode time conditional information""" + + def __init__(self, dim: int): + super().__init__() + if (dim % 2) != 0: + raise ValueError(f"Input dimension {dim} is not divisible by 2!") + half_dim = dim // 2 + self.weights = nn.Parameter(torch.randn(half_dim)) + + def forward(self, t: torch.Tensor) -> torch.Tensor: + """ + Args: + t: input time tensor, shape (B) + + Return: + fouriered: the encoded time conditional embedding, shape (B, D) + """ + t = einops.rearrange(t, 'b -> b 1') + freqs = t * einops.rearrange(self.weights, 'd -> 1 d') * 2 * math.pi + fouriered = torch.cat((freqs.sin(), freqs.cos()), dim=-1) + return fouriered + + +class ConvPositionEmbed(Module): + """The Convolutional Embedding to encode time information of each frame""" + + def __init__(self, dim: int, kernel_size: int, groups: Optional[int] = None): + super().__init__() + if (kernel_size % 2) == 0: + raise ValueError(f"Kernel size {kernel_size} is divisible by 2!") + + if groups is None: + groups = dim + + self.dw_conv1d = nn.Sequential( + nn.Conv1d(dim, dim, kernel_size, groups=groups, padding=kernel_size // 2), nn.GELU() + ) + + def forward(self, x, mask=None): + """ + Args: + x: input tensor, shape (B, T, D) + + Return: + out: output tensor with the same shape (B, T, D) + """ + + if mask is not None: + mask = mask[..., None] + x = x.masked_fill(mask, 0.0) + + x = einops.rearrange(x, 'b n c -> b c n') + x = self.dw_conv1d(x) + out = einops.rearrange(x, 'b c n -> b n c') + + if mask is not None: + out = out.masked_fill(mask, 0.0) + + return out + + +class RMSNorm(Module): + """The Root Mean Square Layer Normalization + + References: + - Zhang et al., Root Mean Square Layer Normalization, 2019 + """ + + def __init__(self, dim): + super().__init__() + self.scale = dim**0.5 + self.gamma = nn.Parameter(torch.ones(dim)) + + def forward(self, x: torch.Tensor): + return F.normalize(x, dim=-1) * self.scale * self.gamma + + +class AdaptiveRMSNorm(Module): + """ + Adaptive Root Mean Square Layer Normalization given a conditional embedding. + This enables the model to consider the conditional input during normalization. 
+    """
+
+    def __init__(self, dim: int, cond_dim: Optional[int] = None):
+        super().__init__()
+        if cond_dim is None:
+            cond_dim = dim
+        self.scale = dim**0.5
+
+        self.to_gamma = nn.Linear(cond_dim, dim)
+        self.to_beta = nn.Linear(cond_dim, dim)
+
+        # init adaptive normalization to identity
+
+        nn.init.zeros_(self.to_gamma.weight)
+        nn.init.ones_(self.to_gamma.bias)
+
+        nn.init.zeros_(self.to_beta.weight)
+        nn.init.zeros_(self.to_beta.bias)
+
+    def forward(self, x: torch.Tensor, cond: torch.Tensor):
+        normed = F.normalize(x, dim=-1) * self.scale
+
+        gamma, beta = self.to_gamma(cond), self.to_beta(cond)
+        gamma = einops.rearrange(gamma, 'B D -> B 1 D')
+        beta = einops.rearrange(beta, 'B D -> B 1 D')
+
+        return normed * gamma + beta
+
+
+class GEGLU(Module):
+    """The GeGLU activation implementation"""
+
+    def forward(self, x: torch.Tensor):
+        x, gate = x.chunk(2, dim=-1)
+        return F.gelu(gate) * x
+
+
+def get_feedforward_layer(dim: int, mult: int = 4, dropout: float = 0.0):
+    """
+    Return a Feed-Forward layer for the Transformer Layer.
+    GeGLU activation is used in this FF layer
+    """
+    dim_inner = int(dim * mult * 2 / 3)
+    return nn.Sequential(nn.Linear(dim, dim_inner * 2), GEGLU(), nn.Dropout(dropout), nn.Linear(dim_inner, dim))
+
+
+class TransformerUNet(NeuralModule):
+    """
+    Implementation of the Transformer encoder model with U-Net structure used in
+    VoiceBox and AudioBox
+
+    References:
+        Le et al., Voicebox: Text-Guided Multilingual Universal Speech Generation at Scale, 2023
+        Vyas et al., Audiobox: Unified Audio Generation with Natural Language Prompts, 2023
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        depth: int,
+        heads: int = 8,
+        ff_mult: int = 4,
+        attn_dropout: float = 0.0,
+        ff_dropout: float = 0.0,
+        max_positions: int = 6000,
+        adaptive_rmsnorm: bool = False,
+        adaptive_rmsnorm_cond_dim_in: Optional[int] = None,
+        use_unet_skip_connection: bool = True,
+        skip_connect_scale: Optional[int] = None,
+    ):
+        """
+        Args:
+            dim: Embedding dimension
+            depth: Number of Transformer Encoder Layers
+            heads: Number of heads in MHA
+            ff_mult: The multiplier for the feedforward dimension (ff_dim = ff_mult * dim)
+            attn_dropout: dropout rate for the MHA layer
+            ff_dropout: dropout rate for the feedforward layer
+            max_positions: The maximum time length of the input during training and inference
+            adaptive_rmsnorm: Whether to use AdaptiveRMS layer.
+                Set to True if the model has a conditional embedding in forward()
+            adaptive_rmsnorm_cond_dim_in: Dimension of the conditional embedding
+            use_unet_skip_connection: Whether to use U-Net skip connections
+            skip_connect_scale: The scale of the U-Net connection.
+ """ + super().__init__() + if (depth % 2) != 0: + raise ValueError(f"Number of layers {depth} is not divisible by 2!") + self.layers = nn.ModuleList([]) + self.init_alibi(max_positions=max_positions, heads=heads) + + if adaptive_rmsnorm: + rmsnorm_class = partial(AdaptiveRMSNorm, cond_dim=adaptive_rmsnorm_cond_dim_in) + else: + rmsnorm_class = RMSNorm + + if skip_connect_scale is None: + self.skip_connect_scale = 2**-0.5 + else: + self.skip_connect_scale = skip_connect_scale + + for ind in range(depth): + layer = ind + 1 + has_skip = use_unet_skip_connection and layer > (depth // 2) + + self.layers.append( + nn.ModuleList( + [ + nn.Linear(dim * 2, dim) if has_skip else None, + rmsnorm_class(dim=dim), + nn.MultiheadAttention( + embed_dim=dim, + num_heads=heads, + dropout=attn_dropout, + batch_first=True, + ), + rmsnorm_class(dim=dim), + get_feedforward_layer(dim=dim, mult=ff_mult, dropout=ff_dropout), + ] + ) + ) + + self.final_norm = RMSNorm(dim) + + logging.debug('Initialized %s with', self.__class__.__name__) + logging.debug('\tembedding dim: %s', dim) + logging.debug('\tNumber of Layer: %s', depth) + logging.debug('\tfeedforward dim: %s', dim * ff_mult) + logging.debug('\tnumber of heads: %s', heads) + logging.debug('\tDropout rate of MHA: %s', attn_dropout) + logging.debug('\tDropout rate of FF: %s', ff_dropout) + logging.debug('\tnumber of heads: %s', heads) + logging.debug('\tmaximun time length: %s', max_positions) + logging.debug('\tuse AdaptiveRMS: %s', adaptive_rmsnorm) + logging.debug('\tConditional dim: %s', adaptive_rmsnorm_cond_dim_in) + logging.debug('\tUse UNet connection: %s', use_unet_skip_connection) + logging.debug('\tskip connect scale: %s', self.skip_connect_scale) + + def init_alibi( + self, + max_positions: int, + heads: int, + ): + """Initialize the Alibi bias parameters + + References: + - Press et al., Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation, 2021 + """ + + def get_slopes(n): + ratio = 2 ** (-8 / n) + return ratio ** torch.arange(1, n + 1) + + if not math.log2(heads).is_integer(): + logging.warning( + "It is recommend to set number of attention heads to be the power of 2 for the Alibi bias!" + ) + logging.warning(f"Current value of heads: {heads}") + + self.slopes = nn.Parameter(einops.rearrange(get_slopes(heads), "B -> B 1 1")) + + pos_matrix = ( + -1 * torch.abs(torch.arange(max_positions).unsqueeze(0) - torch.arange(max_positions).unsqueeze(1)).float() + ) + pos_matrix = einops.rearrange(pos_matrix, "T1 T2 -> 1 T1 T2") + self.register_buffer('pos_matrix', pos_matrix, persistent=False) + + @property + def input_types(self) -> Dict[str, NeuralType]: + """Returns definitions of module output ports.""" + return { + "x": NeuralType(('B', 'T', 'D'), FloatType()), + "key_padding_mask": NeuralType(('B', 'T'), BoolType(), optional=True), + "adaptive_rmsnorm_cond": NeuralType(('B', 'D'), FloatType(), optional=True), + } + + @property + def output_types(self) -> Dict[str, NeuralType]: + """Returns definitions of module output ports.""" + return { + "output": NeuralType(('B', 'T', 'D'), FloatType()), + } + + @typecheck() + def forward(self, x, key_padding_mask: Optional[torch.Tensor] = None, adaptive_rmsnorm_cond=None): + """Forward pass of the model. 
+ + Args: + input: input tensor, shape (B, C, D, T) + key_padding_mask: mask tensor indicating the padding parts, shape (B, T) + adaptive_rmsnorm_cond: conditional input for the model, shape (B, D) + """ + batch_size, seq_len, *_ = x.shape + skip_connects = [] + alibi_bias = self.get_alibi_bias(batch_size=batch_size, seq_len=seq_len) + + rmsnorm_kwargs = dict() + if adaptive_rmsnorm_cond is not None: + rmsnorm_kwargs = dict(cond=adaptive_rmsnorm_cond) + + for skip_combiner, attn_prenorm, attn, ff_prenorm, ff in self.layers: + + if skip_combiner is None: + skip_connects.append(x) + else: + skip_connect = skip_connects.pop() * self.skip_connect_scale + x = torch.cat((x, skip_connect), dim=-1) + x = skip_combiner(x) + + attn_input = attn_prenorm(x, **rmsnorm_kwargs) + if key_padding_mask is not None: + # Since Alibi_bias is a float-type attn_mask, the padding_mask need to be float-type. + float_key_padding_mask = key_padding_mask.float() + float_key_padding_mask = float_key_padding_mask.masked_fill(key_padding_mask, float('-inf')) + else: + float_key_padding_mask = None + + attn_output, _ = attn( + query=attn_input, + key=attn_input, + value=attn_input, + key_padding_mask=float_key_padding_mask, + need_weights=False, + attn_mask=alibi_bias, + ) + x = x + attn_output + + ff_input = ff_prenorm(x, **rmsnorm_kwargs) + x = ff(ff_input) + x + + return self.final_norm(x) + + def get_alibi_bias(self, batch_size: int, seq_len: int): + """ + Return the alibi_bias given batch size and seqence length + """ + pos_matrix = self.pos_matrix[:, :seq_len, :seq_len] + alibi_bias = pos_matrix * self.slopes + alibi_bias = alibi_bias.repeat(batch_size, 1, 1) + + return alibi_bias + + +class SpectrogramTransformerUNet(NeuralModule): + """This model handles complex-valued inputs by stacking real and imaginary components. + Stacked tensor is processed using TransformerUNet and the output is projected to generate real + and imaginary components of the output channels. 
+ + Convolutional Positional Embedding is applied for the input sequence + """ + + def __init__( + self, + in_channels: int = 1, + out_channels: int = 1, + freq_dim: int = 256, + dim: int = 1024, + depth: int = 24, + heads: int = 16, + ff_mult: int = 4, + ff_dropout: float = 0.0, + attn_dropout: float = 0.0, + max_positions: int = 6000, + time_hidden_dim: Optional[int] = None, + conv_pos_embed_kernel_size: int = 31, + conv_pos_embed_groups: Optional[int] = None, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + dim_in = freq_dim * in_channels * 2 + + if time_hidden_dim is None: + time_hidden_dim = dim * 4 + + self.proj_in = nn.Linear(dim_in, dim) + + self.sinu_pos_emb = nn.Sequential(LearnedSinusoidalPosEmb(dim), nn.Linear(dim, time_hidden_dim), nn.SiLU()) + + self.conv_embed = ConvPositionEmbed( + dim=dim, kernel_size=conv_pos_embed_kernel_size, groups=conv_pos_embed_groups + ) + + self.transformerunet = TransformerUNet( + dim=dim, + depth=depth, + heads=heads, + ff_mult=ff_mult, + ff_dropout=ff_dropout, + attn_dropout=attn_dropout, + max_positions=max_positions, + adaptive_rmsnorm=True, + adaptive_rmsnorm_cond_dim_in=time_hidden_dim, + use_unet_skip_connection=True, + ) + + # 2x the frequency dimension as the model operates in the complex-value domain + dim_out = freq_dim * out_channels * 2 + + self.proj_out = nn.Linear(dim, dim_out) + + logging.debug('Initialized %s with', self.__class__.__name__) + logging.debug('\tin_channels: %s', self.in_channels) + logging.debug('\tout_channels: %s', self.out_channels) + logging.debug('\tInput frequency dimension: %s', freq_dim) + + @property + def input_types(self) -> Dict[str, NeuralType]: + """Returns definitions of module output ports.""" + return { + "input": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), + "input_length": NeuralType(('B',), LengthsType(), optional=True), + "condition": NeuralType(('B',), FloatType(), optional=True), + } + + @property + def output_types(self) -> Dict[str, NeuralType]: + """Returns definitions of module output ports.""" + return { + "output": NeuralType(('B', 'C', 'D', 'T'), SpectrogramType()), + "output_length": NeuralType(('B',), LengthsType(), optional=True), + } + + @staticmethod + def _get_key_padding_mask(input_length: torch.Tensor, max_length: int): + """ + Return the self_attention masking according to the input length. + 0 indicates the frame is in the valid range, while 1 indicates the frame is a padding frame. + Args: + input_length: shape (B) + max_length (int): The maximum length of the input sequence + + return: + key_padding_mask: shape (B, T) + """ + key_padding_mask = torch.arange(max_length).expand(len(input_length), max_length).to(input_length.device) + key_padding_mask = key_padding_mask >= input_length.unsqueeze(1) + return key_padding_mask + + @typecheck() + def forward(self, input, input_length=None, condition=None): + """Forward pass of the model. 
+ + Args: + input: input tensor, shape (B, C, D, T) + input_length: length of the valid time steps for each example in the batch, shape (B,) + condition: scalar condition (time) for the model, will be embedded using `self.time_embedding` + """ + # Stack real and imaginary components + B, C_in, D, T = input.shape + if C_in != self.in_channels: + raise RuntimeError(f'Unexpected input channel size {C_in}, expected {self.in_channels}') + + input_real_imag = torch.stack([input.real, input.imag], dim=2) + input = einops.rearrange(input_real_imag, 'B C RI D T -> B T (C RI D)') + + x = self.proj_in(input) + key_padding_mask = self._get_key_padding_mask(input_length, max_length=T) + x = self.conv_embed(x, mask=key_padding_mask) + x + + if condition is None: + raise NotImplementedError + + time_emb = self.sinu_pos_emb(condition) + + x = self.transformerunet(x=x, key_padding_mask=key_padding_mask, adaptive_rmsnorm_cond=time_emb) + + output = self.proj_out(x) + output = einops.rearrange(output, "B T (C RI D) -> B C D T RI", C=self.out_channels, RI=2, D=D) + output = torch.view_as_complex(output.contiguous()) + + return output, input_length diff --git a/nemo/collections/audio/parts/utils/callbacks.py b/nemo/collections/audio/parts/utils/callbacks.py new file mode 100644 index 000000000000..093d5a11f419 --- /dev/null +++ b/nemo/collections/audio/parts/utils/callbacks.py @@ -0,0 +1,177 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Type + +import einops +import torch +from pytorch_lightning import Callback, LightningModule, Trainer +from pytorch_lightning.loggers import TensorBoardLogger +from pytorch_lightning.loggers.logger import Logger +from pytorch_lightning.loggers.wandb import WandbLogger + +from nemo.utils import logging +from nemo.utils.decorators import experimental + +HAVE_WANDB = True +try: + import wandb +except ModuleNotFoundError: + HAVE_WANDB = False + + +def _get_logger(loggers: List[Logger], logger_type: Type[Logger]): + for logger in loggers: + if isinstance(logger, logger_type): + if hasattr(logger, "experiment"): + return logger.experiment + else: + return logger + raise ValueError(f"Could not find {logger_type} logger in {loggers}.") + + +@experimental +class SpeechEnhancementLoggingCallback(Callback): + """ + Callback which can log artifacts (eg. model predictions, graphs) to local disk, Tensorboard, and/or WandB. + + Args: + data_loader: Data to log artifacts for. + output_dir: Optional local directory. If provided, artifacts will be saved in output_dir. + loggers: Optional list of loggers to use if logging to tensorboard or wandb. + log_tensorboard: Whether to log artifacts to tensorboard. + log_wandb: Whether to log artifacts to WandB. 
+ """ + + def __init__( + self, + data_loader, + data_loader_idx: int, + loggers: Optional[List[Logger]] = None, + log_tensorboard: bool = False, + log_wandb: bool = False, + sample_rate: int = 16000, + max_utts: Optional[int] = None, + ): + self.data_loader = data_loader + self.data_loader_idx = data_loader_idx + self.loggers = loggers if loggers else [] + self.log_tensorboard = log_tensorboard + self.log_wandb = log_wandb + self.sample_rate = sample_rate + self.max_utts = max_utts + + if log_tensorboard: + logging.info('Creating tensorboard logger') + self.tensorboard_logger = _get_logger(self.loggers, TensorBoardLogger) + else: + logging.debug('Not using tensorbord logger') + self.tensorboard_logger = None + + if log_wandb: + if not HAVE_WANDB: + raise ValueError("Wandb not installed.") + logging.info('Creating wandb logger') + self.wandb_logger = _get_logger(self.loggers, WandbLogger) + else: + logging.debug('Not using wandb logger') + self.wandb_logger = None + + logging.debug('Initialized %s with', self.__class__.__name__) + logging.debug('\tlog_tensorboard: %s', self.log_tensorboard) + logging.debug('\tlog_wandb: %s', self.log_wandb) + + def _log_audio(self, audios: torch.Tensor, lengths: torch.Tensor, step: int, label: str = "input"): + + num_utts = audios.size(0) + for audio_idx in range(num_utts): + length = lengths[audio_idx] + if self.tensorboard_logger: + self.tensorboard_logger.add_audio( + tag=f"{label}_{audio_idx}", + snd_tensor=audios[audio_idx, :length], + global_step=step, + sample_rate=self.sample_rate, + ) + + if self.wandb_logger: + wandb_audio = ( + wandb.Audio(audios[audio_idx], sample_rate=self.sample_rate, caption=f"{label}_{audio_idx}"), + ) + self.wandb_logger.log({f"{label}_{audio_idx}": wandb_audio}) + + def on_validation_epoch_end(self, trainer: Trainer, model: LightningModule): + """Log artifacts at the end of an epoch.""" + epoch = 1 + model.current_epoch + output_signal_list = [] + output_length_list = [] + num_examples_uploaded = 0 + + logging.info(f"Logging processed speech for validation dataset {self.data_loader_idx}...") + for batch in self.data_loader: + if isinstance(batch, dict): + # lhotse batches are dictionaries + input_signal = batch['input_signal'] + input_length = batch['input_length'] + target_signal = batch.get('target_signal', input_signal.clone()) + else: + input_signal, input_length, target_signal, _ = batch + + if self.max_utts is None: + num_examples = input_signal.size(0) # batch size + do_upload = True + else: + do_upload = num_examples_uploaded < self.max_utts + num_examples = min(self.max_utts - num_examples_uploaded, input_signal.size(0)) + num_examples_uploaded += num_examples + + if do_upload: + # Only pick the required numbers of speech to the logger + input_signal = input_signal[:num_examples, ...] + target_signal = target_signal[:num_examples, ...] 
+ input_length = input_length[:num_examples] + + # For consistency, the model uses multi-channel format, even if the channel dimension is 1 + if input_signal.ndim == 2: + input_signal = einops.rearrange(input_signal, 'B T -> B 1 T') + if target_signal.ndim == 2: + target_signal = einops.rearrange(target_signal, 'B T -> B 1 T') + + input_signal = input_signal.to(model.device) + input_length = input_length.to(model.device) + + output_signal, output_length = model(input_signal=input_signal, input_length=input_length) + output_signal_list.append(output_signal.to(target_signal.device)) + output_length_list.append(output_length.to(target_signal.device)) + + if len(output_signal_list) == 0: + logging.debug('List are empty, no artifacts to log at epoch %d.', epoch) + return + + output_signals = torch.concat(output_signal_list, dim=0) + output_lengths = torch.concat(output_length_list, dim=0) + if output_signals.size(1) != 1: + logging.error( + f"Currently only supports single-channel audio! Current output shape: {output_signals.shape}" + ) + raise NotImplementedError + + output_signals = einops.rearrange(output_signals, "B 1 T -> B T") + + self._log_audio( + audios=output_signals, + lengths=output_lengths, + step=model.global_step, + label=f"dataloader_{self.data_loader_idx}_processed", + ) From 04ca8310f6b0dff0acceec4ca76c33b4abb77580 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Mon, 26 Aug 2024 00:10:56 -0700 Subject: [PATCH 10/31] Revert torchrun fix for model import (#10251) Signed-off-by: Alexandros Koumparoulis --- nemo/lightning/io/mixin.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index eff4cd9434ce..e249e2e318b6 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -300,13 +300,8 @@ def import_ckpt(self, path: str, overwrite: bool = False, base_path: Optional[Pa """ connector = self._get_connector(path) ckpt_path: Path = connector.local_path(base_path=base_path) - # If already in multiproc environment (e.g. 
due to torchrun invocation) run only on RANK = 0 - from nemo.utils.get_rank import is_global_rank_zero - - if is_global_rank_zero(): - ckpt_path = connector(ckpt_path, overwrite=overwrite) - connector.on_import_ckpt(self) - + ckpt_path = connector(ckpt_path, overwrite=overwrite) + connector.on_import_ckpt(self) return ckpt_path @classmethod From 7a8c0e85dd3964fa775d7c5ccb872105db206d2b Mon Sep 17 00:00:00 2001 From: Marc Romeyn Date: Mon, 26 Aug 2024 18:27:00 +0200 Subject: [PATCH 11/31] [NeMo-UX[ Move nemotron imports inline (#10255) * Move nemotron transformers + tokenizer imports inline to reduce number of required deps Signed-off-by: Marc Romeyn * Apply isort and black reformatting Signed-off-by: marcromeyn --------- Signed-off-by: Marc Romeyn Signed-off-by: marcromeyn Co-authored-by: marcromeyn --- nemo/collections/llm/gpt/model/nemotron.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/nemo/collections/llm/gpt/model/nemotron.py b/nemo/collections/llm/gpt/model/nemotron.py index dd659f7eedf7..d946e5f48cce 100644 --- a/nemo/collections/llm/gpt/model/nemotron.py +++ b/nemo/collections/llm/gpt/model/nemotron.py @@ -4,16 +4,17 @@ import torch from torch import nn -from transformers import NemotronConfig as HFNemotronConfig -from transformers import NemotronForCausalLM -from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer from nemo.collections.llm.fn.activation import squared_relu from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel from nemo.collections.llm.utils import Config from nemo.lightning import OptimizerModule, io, teardown if TYPE_CHECKING: + from transformers import NemotronConfig as HFNemotronConfig + from transformers import NemotronForCausalLM + + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec @@ -123,6 +124,8 @@ def init(self) -> NemotronModel: return NemotronModel(self.config, tokenizer=self.tokenizer) def apply(self, output_path: Path) -> Path: + from transformers import NemotronForCausalLM + source = NemotronForCausalLM.from_pretrained(str(self)) target = self.init() trainer = self.nemo_setup(target) @@ -155,10 +158,14 @@ def convert_state(self, source, target): @property def tokenizer(self) -> "AutoTokenizer": + from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer + return AutoTokenizer(str(self)) @property def config(self) -> NemotronConfig: + from transformers import NemotronConfig as HFNemotronConfig + source = HFNemotronConfig.from_pretrained(str(self)) def make_vocab_size_divisible_by(vocab_size): @@ -224,6 +231,8 @@ def tokenizer(self): @property def config(self) -> "HFNemotronConfig": + from transformers import NemotronConfig as HFNemotronConfig + source: NemotronConfig = io.load_context(str(self)).model.config return HFNemotronConfig( From ac5cb06cacd5c8f5b679be26f523fb7f9cf900e9 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Mon, 26 Aug 2024 09:36:33 -0700 Subject: [PATCH 12/31] Wrap CPU model init with megatron_lazy_init_context (#10219) * Wrap CPU model init with megatron_lazy_init_context Signed-off-by: Alexandros Koumparoulis * Cleanup checkpoint-dir if saving fails Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- 
nemo/lightning/io/connector.py | 3 ++- nemo/lightning/io/pl.py | 23 ++++++++++++++++------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py index 69368599682e..512f3bc4f12e 100644 --- a/nemo/lightning/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -145,6 +145,7 @@ def nemo_setup(self, model: pl.LightningModule, trainer: Optional[pl.Trainer] = pl.Trainer: The trainer configured with the model and strategy. """ from nemo.lightning import MegatronStrategy, Trainer + from nemo.lightning._strategy_lib import megatron_lazy_init_context _trainer = trainer or Trainer( devices=1, accelerator="cpu", strategy=MegatronStrategy(store_optimizer_states=False) @@ -155,7 +156,7 @@ def nemo_setup(self, model: pl.LightningModule, trainer: Optional[pl.Trainer] = if not model.state_dict(): _trainer.strategy.lazy_init = True - with _trainer.init_module(): + with _trainer.init_module(), megatron_lazy_init_context(model.config): model.configure_model() return _trainer diff --git a/nemo/lightning/io/pl.py b/nemo/lightning/io/pl.py index d0749fbeead7..f43d24792c1a 100644 --- a/nemo/lightning/io/pl.py +++ b/nemo/lightning/io/pl.py @@ -126,13 +126,22 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio validate_sharding_integrity = not (self.validated_consistency and self.assume_constant_structure) self.validated_consistency = True - return dist_checkpointing.save( - sharded_state_dict=checkpoint, - checkpoint_dir=checkpoint_dir, - sharded_strategy=self.save_sharded_strategy, - validate_access_integrity=validate_sharding_integrity, - async_sharded_save=self.async_save, - ) + + try: + return dist_checkpointing.save( + sharded_state_dict=checkpoint, + checkpoint_dir=checkpoint_dir, + sharded_strategy=self.save_sharded_strategy, + validate_access_integrity=validate_sharding_integrity, + async_sharded_save=self.async_save, + ) + except: + logging.error(f"Failed to save checkpoint to {checkpoint_dir}") + # Do cleanup. + import shutil + + shutil.rmtree(checkpoint_dir) + raise @override def load_checkpoint( From 076f9ea57bfcbbcd7acb242eae281da102c7d79d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 27 Aug 2024 00:14:13 +0200 Subject: [PATCH 13/31] Bump `Dockerfile.ci` (2024-08-22) (#10227) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [🤠]: Howdy folks, let's bump `Dockerfile.ci` to 124bcff ! 
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * fix bert flags Signed-off-by: Oliver Koenig --------- Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Signed-off-by: Oliver Koenig Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> --- .github/workflows/cicd-main.yml | 12 ++++++------ Dockerfile.ci | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index a086a493f683..396ef03bd661 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -2061,7 +2061,7 @@ jobs: with: RUNNER: self-hosted-azure SCRIPT: | - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + python examples/nlp/language_modeling/megatron_bert_pretraining.py \ trainer.devices=2 \ trainer.accelerator=gpu \ trainer.log_every_n_steps=1 \ @@ -2091,7 +2091,7 @@ jobs: model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + python examples/nlp/language_modeling/megatron_bert_pretraining.py \ trainer.devices=2 \ trainer.accelerator=gpu \ trainer.log_every_n_steps=1 \ @@ -2128,7 +2128,7 @@ jobs: with: RUNNER: self-hosted-azure SCRIPT: | - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + python examples/nlp/language_modeling/megatron_bert_pretraining.py \ trainer.devices=2 \ trainer.accelerator=gpu \ trainer.log_every_n_steps=1 \ @@ -2159,7 +2159,7 @@ jobs: model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + python examples/nlp/language_modeling/megatron_bert_pretraining.py \ trainer.devices=2 \ trainer.accelerator=gpu \ trainer.log_every_n_steps=1 \ @@ -2199,7 +2199,7 @@ jobs: with: RUNNER: self-hosted-azure SCRIPT: | - NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + python examples/nlp/language_modeling/megatron_bert_pretraining.py \ trainer.devices=2 \ trainer.accelerator=gpu \ trainer.log_every_n_steps=1 \ @@ -2229,7 +2229,7 @@ jobs: model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + python examples/nlp/language_modeling/megatron_bert_pretraining.py \ trainer.devices=2 \ trainer.accelerator=gpu \ trainer.log_every_n_steps=1 \ diff --git a/Dockerfile.ci b/Dockerfile.ci index 38b82a288a2b..161671bf5a5a 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -34,7 +34,7 @@ WORKDIR /workspace # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG 
MODELOPT_VERSION=0.15.0 -ARG MCORE_TAG=2fd6e2b74efca73a1f2d27b89bb5419384b4d3bf +ARG MCORE_TAG=124bcff2a8153eccea4d7d0e4df5c5562aab50b9 ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \ --mount=type=bind,source=requirements,target=requirements \ From 596438700de72816ca8b8875af2b8694f04ceda7 Mon Sep 17 00:00:00 2001 From: Slyne Deng Date: Mon, 26 Aug 2024 16:35:42 -0700 Subject: [PATCH 14/31] salm export trtllm (#10245) Signed-off-by: slyne deng Co-authored-by: slyne deng --- .../multimodal/speech_llm/export/README.md | 83 +++++ .../speech_llm/export/conf/salm_export.yaml | 16 + .../speech_llm/export/export_salm.py | 39 +++ .../speech_llm/export/extract_salm_weights.py | 204 ++++++++++++ nemo/deploy/multimodal/query_multimodal.py | 12 +- nemo/export/multimodal/build.py | 120 +++++++- nemo/export/multimodal/run.py | 291 +++++++++++++++++- nemo/export/tensorrt_mm_exporter.py | 58 +++- scripts/deploy/multimodal/deploy_triton.py | 15 +- 9 files changed, 810 insertions(+), 28 deletions(-) create mode 100644 examples/multimodal/speech_llm/export/README.md create mode 100644 examples/multimodal/speech_llm/export/conf/salm_export.yaml create mode 100644 examples/multimodal/speech_llm/export/export_salm.py create mode 100644 examples/multimodal/speech_llm/export/extract_salm_weights.py diff --git a/examples/multimodal/speech_llm/export/README.md b/examples/multimodal/speech_llm/export/README.md new file mode 100644 index 000000000000..05e44d112cce --- /dev/null +++ b/examples/multimodal/speech_llm/export/README.md @@ -0,0 +1,83 @@ +## Setup +In this part, we are going to export SALM model into TRTLLM. +First, let's download the [SALM nemo model](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/speechllm_fc_llama2_7b/) from NVIDIA ngc. + +```bash +wget --content-disposition 'https://api.ngc.nvidia.com/v2/models/org/nvidia/team/nemo/speechllm_fc_llama2_7b/1.23.1/files?redirect=true&path=speechllm_fc_llama2_7b.nemo' -O speechllm_fc_llama2_7b.nemo +``` + +Then, we need to extract the different parts of SALM. +```bash +output=$PWD/output +python3 extract_salm_weights.py --model_file_path=speechllm_fc_llama2_7b.nemo --output_dir=$output +``` +It takes a while to run the above command. + +Under the `output` dir, you'll see: +``` +output + |___speechllm_fc_llama2_7b_lora.nemo + |___speechllm_fc_llama2_7b_perception + | |____model_config.yaml + | |____model_weights.ckpt + |___speechllm_fc_llama2_7b_llm.nemo + |___ xxx.tokenizer.model +``` + +After we get the lora nemo model and llm nemo model, we can merge the lora part into the llm by: +```bash +python /opt/NeMo/scripts/nlp_language_modeling/merge_lora_weights/merge.py \ + trainer.accelerator=gpu \ + tensor_model_parallel_size=1 \ + pipeline_model_parallel_size=1 \ + gpt_model_file=output/speechllm_fc_llama2_7b_llm.nemo \ + lora_model_path=output/speechllm_fc_llama2_7b_lora.nemo \ + merged_model_path=speechllm_fc_llama2_7b_llm_merged.nemo +``` + +Now we are able to export the engine by: +```bash +python3 export_salm.py \ + model.perception_model_path=output/speechllm_fc_llama2_7b_perception \ + model.llm_model_path=output/speechllm_fc_llama2_7b_llm_merged.nemo +``` + +You should be able to get the generated engines under `./salm` folder. 
To run the engines, you may run: +```python +from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter + +output_dir = "/ws/salm" # the engine directory +trt_llm_exporter = TensorRTMMExporter(model_dir=output_dir, load_model=True, modality='audio') +input_text = "Q: what's the transcription of the audio? A:" +input_media = '/ws/data/test_audio.wav' +print(trt_llm_exporter.forward(input_text, input_media)) + +``` + +## Deploy +If you want to generate the engines and deploy them with Triton Inference Server, you may also run: + +```bash +python3 NeMo/scripts/deploy/multimodal/deploy_triton.py \ + --modality="audio" \ + --visual_checkpoint=NeMo/examples/multimodal/speech_llm/export/output/speechllm_fc_llama2_7b_perception \ + --llm_checkpoint=NeMo/examples/multimodal/speech_llm/export/output/speechllm_fc_llama2_7b_llm_merged.nemo \ + --llm_model_type="llama" \ + --model_type="salm" \ + --triton_model_name="salm" \ + --max_input_len=4096 \ + --max_output_len=256 \ + --max_multimodal_len=3072 \ + --triton_model_repository=/tmp/trt_model_dir/ +``` + +And on client side, you may run: +```bash +python3 NeMo/scripts/deploy/multimodal/query.py \ + --model_name="salm" \ + --model_type="salm" \ + --input_text="Q: what's the transcription of the audio? A:" \ + --input_media=/ws/data/test_audio.wav +``` + +For more details, please check `NeMo/scripts/deploy/multimodal/deploy_triton.py` and ` NeMo/scripts/deploy/multimodal/query.py`. \ No newline at end of file diff --git a/examples/multimodal/speech_llm/export/conf/salm_export.yaml b/examples/multimodal/speech_llm/export/conf/salm_export.yaml new file mode 100644 index 000000000000..54ab6e9180c5 --- /dev/null +++ b/examples/multimodal/speech_llm/export/conf/salm_export.yaml @@ -0,0 +1,16 @@ +name: speechllm_salm +infer: + output_dir: ./salm + max_batch_size: 1 + tensor_parallelism: 1 + max_input_len: 4096 + max_output_len: 256 + max_multimodal_len: 3072 + perception_max_batch_size: 1 + +model: + type: salm + precision: float16 + perception_model_path: /path/to/speechllm_llama2_7b_perception + llm_model_path: /path/to/speechllm_llama2_7b_llm.nemo + llm_model_type: llama diff --git a/examples/multimodal/speech_llm/export/export_salm.py b/examples/multimodal/speech_llm/export/export_salm.py new file mode 100644 index 000000000000..00500bf46f50 --- /dev/null +++ b/examples/multimodal/speech_llm/export/export_salm.py @@ -0,0 +1,39 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
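+
+# Builds the SALM TensorRT engines: the audio perception module is exported to a
+# standalone TensorRT engine and the LLM checkpoint (the LoRA-merged model, see
+# README.md) is exported with TensorRT-LLM, using the settings in conf/salm_export.yaml.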
+ +from nemo.core.config import hydra_runner +from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter + + +@hydra_runner(config_path='conf', config_name='salm_export') +def main(cfg): + exporter = TensorRTMMExporter(model_dir=cfg.infer.output_dir, load_model=False, modality='audio') + exporter.export( + visual_checkpoint_path=cfg.model.perception_model_path, + llm_checkpoint_path=cfg.model.llm_model_path, + model_type=cfg.model.type, + llm_model_type=cfg.model.llm_model_type, + tensor_parallel_size=cfg.infer.tensor_parallelism, + max_input_len=cfg.infer.max_input_len, + max_output_len=cfg.infer.max_output_len, + vision_max_batch_size=cfg.infer.perception_max_batch_size, + max_batch_size=cfg.infer.max_batch_size, + max_multimodal_len=cfg.infer.max_multimodal_len, + dtype=cfg.model.precision, + load_model=False, + ) + + +if __name__ == '__main__': + main() diff --git a/examples/multimodal/speech_llm/export/extract_salm_weights.py b/examples/multimodal/speech_llm/export/extract_salm_weights.py new file mode 100644 index 000000000000..0698a411110e --- /dev/null +++ b/examples/multimodal/speech_llm/export/extract_salm_weights.py @@ -0,0 +1,204 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
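+
+# Splits a monolithic SALM .nemo checkpoint into the three artifacts used by the
+# export pipeline: the audio perception module (saved as raw weights + config),
+# the LoRA adapter weights (saved as a .nemo file), and the base LLM (saved as a
+# standalone .nemo file that the LoRA weights can later be merged into).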
+ + +import argparse +import os +import tempfile + +import torch +from megatron.core import dist_checkpointing +from omegaconf import OmegaConf +from pytorch_lightning.trainer.trainer import Trainer + +from nemo.collections.multimodal.speech_llm.modules.perception_modules import AudioPerceptionModule +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector +from nemo.collections.nlp.parts.utils_funcs import load_state_dict_helper +from nemo.utils import logging +from nemo.utils.model_utils import inject_model_parallel_rank + + +def get_config_and_state_dict_from_nemo(filepath, map_location, output_dir, sharded_state_dict=None): + cwd = os.getcwd() + save_restore_connector = NLPSaveRestoreConnector() + + with tempfile.TemporaryDirectory() as tmpdir: + try: + if os.path.isfile(filepath): + save_restore_connector._unpack_nemo_file(path2file=filepath, out_folder=tmpdir) + else: + tmpdir = filepath + + os.chdir(tmpdir) + config_yaml = "model_config.yaml" + model_weights_ckpt = "model_weights.ckpt" + + # find file in tmpdir that endswith "tokenizer.model" + tokenizer = None + for file in os.listdir(tmpdir): + if file.endswith("tokenizer.model"): + tokenizer = file + break + if tokenizer is None: + raise ValueError(f"Tokenizer not found in {tmpdir}") + tokenizer_path = os.path.join(tmpdir, tokenizer) + # copy tokenizer_path to current directory + os.system(f"cp {tokenizer_path} {output_dir}") + tokenizer_path = os.path.join(output_dir, tokenizer) + + # load conf + with open(config_yaml) as f: + conf = OmegaConf.load(f) + + os.chdir(cwd) + model_weights = os.path.join(tmpdir, model_weights_ckpt) + model_weights = inject_model_parallel_rank(model_weights) + state_dict = save_restore_connector._load_state_dict_from_disk(model_weights, map_location=map_location) + + # distributed checkpointing + if state_dict is None and sharded_state_dict is not None: + checkpoint = dict(state_dict=sharded_state_dict) + tmp_model_weights_ckpt = os.path.join(tmpdir, save_restore_connector.model_weights_ckpt) + tmp_model_weights_dir = os.path.splitext(tmp_model_weights_ckpt)[0] + assert os.path.isdir(tmp_model_weights_dir), f'Expected {tmp_model_weights_dir} to be a directory.' 
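+                # The weights were saved with Megatron distributed (sharded) checkpointing,
+                # so load them from the unpacked checkpoint directory via dist_checkpointing.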
+ checkpoint = dist_checkpointing.load( + sharded_state_dict=checkpoint, + checkpoint_dir=tmp_model_weights_dir, + ) + state_dict = checkpoint["state_dict"] + + conf.tokenizer.model = tokenizer_path + return conf, state_dict + finally: + os.chdir(cwd) + + +def get_llm_model_state_dict(state_dict, lora_model_state_dict): + llm_model_state_dict = {} + for key, value in state_dict.items(): + if key.startswith("model."): + if key not in lora_model_state_dict and value != None: + llm_model_state_dict[key] = value + return llm_model_state_dict + + +def get_lora_state_dict(state_dict): + lora_model_state_dict = {} + for key, value in state_dict.items(): + if "adapter_layer.lora" in key and value != None: + lora_model_state_dict[key] = value + return lora_model_state_dict + + +def get_perception_state_dict(state_dict): + perception_state_dict = {} + for key, value in state_dict.items(): + if key.startswith("perception."): + key = key.replace("perception.", "", 1) + perception_state_dict[key] = value + return perception_state_dict + + +def save_llm_model(state_dict, nemo_config, output_path): + if nemo_config.get('megatron_amp_O2', False): + keys = list(state_dict.keys()) + for key in keys: + state_dict[key.replace('model.', 'model.module.', 1)] = state_dict['state_dict'].pop(key) + + trainer = Trainer(accelerator='cpu', strategy=NLPDDPStrategy()) + model = load_state_dict_helper(MegatronGPTModel, nemo_config, trainer, state_dict) + model._save_restore_connector = NLPSaveRestoreConnector() + model.cfg.use_cpu_initialization = False + + model.save_to(output_path) + logging.info(f'llm model saved to: {output_path}') + + +def save_nemo_weights(state_dict, output_dir, config, save_nemo_model=True): + if not os.path.exists(output_dir): + os.mkdir(output_dir) + weight_file = os.path.join(output_dir, "model_weights.ckpt") + torch.save(state_dict, weight_file) + # convert config to yaml + config_file = os.path.join(output_dir, "model_config.yaml") + with open(config_file, "w") as f: + f.write(OmegaConf.to_yaml(config)) + + if save_nemo_model: + # create nemo file + nemo_model_name = f"{output_dir}.nemo" + nemo_path = os.path.join(output_dir, nemo_model_name) + # tar model_config.yaml and model_weights.ckpt + os.system(f"tar -C {output_dir} -cvf {nemo_path} model_config.yaml model_weights.ckpt") + # remove model_config.yaml and model_weights.ckpt + os.system(f"rm {config_file} {weight_file}") + # remove the empty directory + os.system(f"rmdir {output_dir}") + + +def separate_speechllm_model(model_file_path, output_dir, map_location="cuda:0"): + if not os.path.exists(output_dir): + os.mkdir(output_dir) + output_dir = os.path.abspath(output_dir) + + logging.info(f"Separating {model_file_path} into perception, lora, and llm model") + filepath = model_file_path + conf, state_dict = get_config_and_state_dict_from_nemo(filepath, map_location, output_dir) + + base_model_name = os.path.basename(filepath).split(".")[0] + + perception_state_dict = get_perception_state_dict(state_dict) + perception_model_dir = None + if perception_state_dict: + perception_model_dir = f"{base_model_name}_perception" + perception_model_dir = os.path.join(output_dir, perception_model_dir) + save_nemo_weights(perception_state_dict, perception_model_dir, conf.perception, save_nemo_model=False) + + # verify if the exported perception model is correct + perception = AudioPerceptionModule(cfg=conf.perception) + perception.load_state_dict(perception_state_dict) + perception.eval() + print(perception) + 
print(perception(input_signal=torch.randn(1, 1000), input_signal_length=torch.tensor([1000]))) + # absolute path of perception model + logging.info(f"Perception model saved to: {perception_model_dir}") + + lora_model_weights = get_lora_state_dict(state_dict) + lora_model_dir = None + if lora_model_weights: + lora_model_dir = f"{base_model_name}_lora" + lora_model_dir = os.path.join(output_dir, lora_model_dir) + save_nemo_weights(lora_model_weights, lora_model_dir, conf) + logging.info(f"Lora model saved to: {lora_model_dir}.nemo") + # hard code the target model for now + llm_model_weights = get_llm_model_state_dict(state_dict, lora_model_weights) + if llm_model_weights: + llm_model = f"{base_model_name}_llm.nemo" + llm_model = os.path.join(output_dir, llm_model) + conf.target = "nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel" + save_llm_model(llm_model_weights, conf, llm_model) + logging.info(f"LLM model saved to: {llm_model}") + + +# filepath = "/ws/speechllm_fc_llama2_7b.nemo" +# output_dir = "/ws/speechllm_fc_llama2_7b_separated" +# perception_model_dir, lora_model, llm_model = separate_speechllm_model(filepath, output_dir) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Separate speechllm model') + parser.add_argument('--model_file_path', type=str, help='Path to the speechllm model') + parser.add_argument('--output_dir', type=str, help='Output directory to save the separated models') + args = parser.parse_args() + separate_speechllm_model(args.model_file_path, args.output_dir) diff --git a/nemo/deploy/multimodal/query_multimodal.py b/nemo/deploy/multimodal/query_multimodal.py index 1c01c6861048..63e6a3e8c3a6 100644 --- a/nemo/deploy/multimodal/query_multimodal.py +++ b/nemo/deploy/multimodal/query_multimodal.py @@ -13,6 +13,7 @@ # limitations under the License. 
import numpy as np +import soundfile as sf from PIL import Image from nemo.deploy.utils import str_list2numpy @@ -71,6 +72,11 @@ def setup_media(self, input_media): elif self.model_type == "neva" or self.model_type == "vila": media = Image.open(input_media).convert('RGB') return np.expand_dims(np.array(media), axis=0) + elif self.model_type == "salm": + waveform, sample_rate = sf.read(input_media, dtype=np.float32) + input_signal = np.array([waveform], dtype=np.float32) + input_signal_length = np.array([[len(waveform)]], dtype=np.int32) + return {"input_signal": input_signal, "input_signal_length": input_signal_length} else: raise RuntimeError(f"Invalid model type {self.model_type}") @@ -105,8 +111,10 @@ def query( inputs = {"input_text": prompts} media = self.setup_media(input_media) - - inputs["input_media"] = np.repeat(media[np.newaxis, :, :, :, :], prompts.shape[0], axis=0) + if isinstance(media, dict): + inputs.update(media) + else: + inputs["input_media"] = np.repeat(media[np.newaxis, :, :, :, :], prompts.shape[0], axis=0) if batch_size is not None: inputs["batch_size"] = np.full(prompts.shape, batch_size, dtype=np.int_) diff --git a/nemo/export/multimodal/build.py b/nemo/export/multimodal/build.py index 8ee3fa1c05e7..53c598be47c6 100644 --- a/nemo/export/multimodal/build.py +++ b/nemo/export/multimodal/build.py @@ -23,9 +23,12 @@ import tensorrt as trt import torch import yaml +from omegaconf import OmegaConf from tensorrt_llm.builder import Builder from transformers import AutoModel +from nemo.collections.multimodal.speech_llm.modules.perception_modules import AudioPerceptionModule +from nemo.core.classes.common import typecheck from nemo.export.tensorrt_llm import TensorRTLLM from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import load_nemo_model @@ -76,6 +79,32 @@ def export_visual_wrapper_onnx( ) +def export_perception_wrapper_onnx( + perception_wrapper, + input, + output_dir, + input_names=['processed_signal', 'processed_signal_length'], + output_names=['encoded', 'encoded_length'], + dynamic_axes={ + 'processed_signal': {0: 'batch', 2: 'time'}, + 'processed_signal_length': {0: 'batch'}, + 'encoded': {0: 'batch', 1: 'time'}, + 'encoded_length': {0: 'batch'}, + }, +): + logger.log(trt.Logger.INFO, "Exporting onnx") + os.makedirs(f'{output_dir}/onnx', exist_ok=True) + torch.onnx.export( + perception_wrapper, + input, + f'{output_dir}/onnx/perception_encoder.onnx', + opset_version=17, + input_names=input_names, + output_names=output_names, + dynamic_axes=dynamic_axes, + ) + + def build_trt_engine( model_type, input_sizes, @@ -85,8 +114,8 @@ def build_trt_engine( image_size=None, num_frames=None, nemo_config=None, + part_name='visual_encoder', ): - part_name = 'visual_encoder' onnx_file = '%s/onnx/%s.onnx' % (output_dir, part_name) engine_file = '%s/%s.engine' % (output_dir, part_name) config_file = '%s/%s' % (output_dir, "config.json") @@ -131,6 +160,10 @@ def build_trt_engine( # input sizes can be a list of ints (e.g., [3, H, W]) when inputs are images, # or a list of three int lists (e.g., [[1, 1, 2700], [1, 500, 2700], [1, 4096, 2700]]). 
+ # or a list of three list of lists + # (e.g., [{input1: min_shape, input2: min_shape, }, \ + # {input1: opt_shape, input2: opt_shape}, \ + # {input1: max_shape, input2: max_shape}] ) assert isinstance(input_sizes, list), "input_sizes must be a list" if isinstance(input_sizes[0], int): logger.log(trt.Logger.INFO, f"Processed input sizes {input_sizes}") @@ -139,10 +172,23 @@ def build_trt_engine( elif len(input_sizes) == 3 and isinstance(input_sizes[0], list): min_size, opt_size, max_size = input_sizes logger.log(trt.Logger.INFO, f"Processed min/opt/max input sizes {min_size}/{opt_size}/{max_size}") + elif len(input_sizes) == 3 and isinstance(input_sizes[0], dict): + logger.log(trt.Logger.INFO, f"Processed min/opt/max input sizes {input_sizes}") else: raise ValueError(f"invalid input sizes: {input_sizes}") - profile.set_shape(inputT.name, [nMinBS, *min_size], [nOptBS, *opt_size], [nMaxBS, *max_size]) + if isinstance(input_sizes[0], dict): + for i in range(network.num_inputs): + inputT = network.get_input(i) + input_name = inputT.name + min_size = input_sizes[0][input_name] + opt_size = input_sizes[1][input_name] + max_size = input_sizes[2][input_name] + logger.log(trt.Logger.INFO, f"{input_name} min/opt/max input sizes {min_size}/{opt_size}/{max_size}") + profile.set_shape(input_name, min_size, opt_size, max_size) + else: + profile.set_shape(inputT.name, [nMinBS, *min_size], [nOptBS, *opt_size], [nMaxBS, *max_size]) + config.add_optimization_profile(profile) t0 = time() @@ -367,6 +413,76 @@ def forward(self, images): ) +def build_perception_engine( + model_dir: str, + perception_checkpoint_path: str, + model_type: str = "salm", + max_batch_size: int = 1, +): + assert model_type == "salm", f"Invalid model type {model_type}" + + def load_perception_model(perception_checkpoint_path): + weights = "model_weights.ckpt" + perception_state_dict = torch.load(os.path.join(perception_checkpoint_path, weights)) + config = "model_config.yaml" + config = OmegaConf.load(os.path.join(perception_checkpoint_path, config)) + perception = AudioPerceptionModule(cfg=config) + perception.load_state_dict(perception_state_dict) + perception.eval() + return perception + + if not os.path.exists(model_dir): + os.makedirs(model_dir) + # load perception model + perception_model = load_perception_model(perception_checkpoint_path) + feature_extractor = perception_model.preprocessor + input_signal = torch.randn(1, 1000, dtype=torch.float32) + input_signal_length = torch.tensor([1000], dtype=torch.int32) + + processed_signal, processed_signal_length = feature_extractor( + input_signal=input_signal, length=input_signal_length + ) + processed_signal_length = processed_signal_length.to(torch.int32) + dump_path = model_dir + "/feature_extractor.ts" # dump the feature extractor as torchscript + feature_extractor.export(dump_path, (input_signal, input_signal_length)) + + class PerceptionWrapper(torch.nn.Module): + def __init__(self, encoder, modality_adapter, proj): + super().__init__() + self.encoder = encoder + self.modality_adapter = modality_adapter + self.proj = proj + + @typecheck.disable_checks() + def forward(self, processed_signal, processed_signal_length): + encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_length) + encoded, encoded_len = self.modality_adapter(audio_signal=encoded, length=encoded_len) + # b, c, t -> b, t, c + encoded = self.proj(encoded.transpose(1, 2)) + encoded_len = encoded_len.to(torch.int32) + return encoded, encoded_len + + perception = 
PerceptionWrapper(perception_model.encoder, perception_model.modality_adapter, perception_model.proj) + export_perception_wrapper_onnx(perception, (processed_signal, processed_signal_length), model_dir) + # export the onnx perception model to tensorrt engine + # 512 -> 5.12 sec, 3072 -> 30.72 sec + opt_batch_size = max(1, max_batch_size // 2) + shapes = [ + {"processed_signal": [1, 80, 64], "processed_signal_length": [1]}, + {"processed_signal": [opt_batch_size, 80, 512], "processed_signal_length": [opt_batch_size]}, + {"processed_signal": [max_batch_size, 80, 3072], "processed_signal_length": [max_batch_size]}, + ] + build_trt_engine( + model_type, + shapes, + model_dir, + max_batch_size, + dtype=torch.float16, + nemo_config=None, + part_name='perception_encoder', + ) + + def build_visual_engine( model_dir: str, visual_checkpoint_path: str, diff --git a/nemo/export/multimodal/run.py b/nemo/export/multimodal/run.py index 149df995c77a..2cde46ca41fa 100644 --- a/nemo/export/multimodal/run.py +++ b/nemo/export/multimodal/run.py @@ -25,6 +25,7 @@ import einops import numpy as np +import soundfile as sf import tensorrt as trt import tensorrt_llm import tensorrt_llm.profiler as profiler @@ -32,7 +33,7 @@ import yaml from PIL import Image from tensorrt_llm import logger -from tensorrt_llm._utils import str_dtype_to_trt +from tensorrt_llm._utils import str_dtype_to_trt, torch_dtype_to_trt from tensorrt_llm.runtime import ModelRunner, Session, TensorInfo from torch.nn import functional as F from torchvision import transforms @@ -54,7 +55,8 @@ def trt_dtype_to_torch(dtype): class MultimodalModelRunner: - def __init__(self, visual_engine_dir, llm_engine_dir): + def __init__(self, visual_engine_dir, llm_engine_dir, modality='vision'): + self.modality = modality self.runtime_rank = tensorrt_llm.mpi_rank() device_id = self.runtime_rank % torch.cuda.device_count() torch.cuda.set_device(device_id) @@ -68,13 +70,15 @@ def __init__(self, visual_engine_dir, llm_engine_dir): config = json.load(f) self.model_type = config['builder_config']['model_type'] self.vision_precision = config['builder_config']['precision'] + self.modality_precision = config['builder_config']['precision'] self.num_frames = config['builder_config'].get('num_frames', None) self.image_size = config['builder_config'].get('image_size', None) self.profiling_iterations = 20 - self.init_image_encoder(visual_engine_dir) + if modality == 'vision': + self.init_image_encoder(visual_engine_dir) self.init_tokenizer(llm_engine_dir) self.init_llm(llm_engine_dir) if self.model_type == 'lita' or self.model_type == 'vila' or self.model_type == 'vita': @@ -242,10 +246,10 @@ def insert_tokens_by_index(self, input_ids, num_frames): def preprocess(self, warmup, pre_prompt, post_prompt, image, attention_mask, batch_size): if not warmup: - profiler.start("Vision") + profiler.start(self.modality.capitalize()) if not warmup: - profiler.stop("Vision") + profiler.stop(self.modality.capitalize()) if self.model_type == 'vila': visual_features, visual_atts = self.get_visual_features(image, attention_mask) @@ -848,7 +852,7 @@ def print_result(self, input_text, output_text, batch_size, num_beams, run_profi if run_profiling: msec_per_batch = lambda name: 1000 * profiler.elapsed_time_in_sec(name) / self.profiling_iterations logger.info('Latencies per batch (msec)') - logger.info('TRT vision encoder: %.1f' % (msec_per_batch('Vision'))) + logger.info(f'TRT {self.modality} encoder: %.1f' % (msec_per_batch(self.modality.capitalize()))) logger.info('TRTLLM LLM generate: %.1f' 
% (msec_per_batch('LLM'))) logger.info('Multimodal generate: %.1f' % (msec_per_batch('Generate'))) @@ -864,3 +868,278 @@ def load_test_media(self, input_media): raise RuntimeError(f"Invalid model type {self.model_type}") return media + + +class SpeechllmModelRunner(MultimodalModelRunner): + def __init__(self, perception_engine_dir, llm_engine_dir, modality): + """ + perception_engine_dir: path to the perception engine directory + it should contain: + config.json nemo_config.yaml + perception_encoder.engine : tensorrt engine + feature_extractor.ts : torchscript model + llm_engine_dir: path to the LLM engine directory + """ + super().__init__(perception_engine_dir, llm_engine_dir, modality) + assert self.model_type == 'salm' + # init preprocessor + feature_extractor_path = os.path.join(perception_engine_dir, 'feature_extractor.ts') + self.feature_extractor = self.init_speech_preprocessor(feature_extractor_path) + self.init_modality_encoder(perception_engine_dir) + + def init_modality_encoder(self, engine_dir): + """ + Initialize the modality encoder session from the prebuilt engine directory + Args: + engine_dir: str, path to the engine directory + """ + # find file with .engine extension + engine_file = None + for file in os.listdir(engine_dir): + if file.endswith('.engine'): + engine_file = file + break + assert engine_file is not None, f"Engine file not found in {engine_dir}" + encoder_path = os.path.join(engine_dir, engine_file) + logger.info(f'Loading engine from {encoder_path}') + with open(encoder_path, 'rb') as f: + engine_buffer = f.read() + logger.info(f'Creating session from engine {encoder_path}') + self.modality_encoder_session = Session.from_serialized_engine(engine_buffer) + + def init_speech_preprocessor(self, feature_extractor_path): + feature_extractor = torch.jit.load(feature_extractor_path) + feature_extractor.eval() + return feature_extractor + + def process_audio(self, input_signal, input_signal_length): + """ + Args: + input_signal: audio signal in numpy array + input_signal_length: length of the audio signal in numpy array + + Returns: + processed_signal: torch.tensor [B, 80, T] + processed_signal_length [B] + """ + input_signal = torch.tensor(input_signal, dtype=torch.float32) + input_signal_length = torch.tensor(input_signal_length, dtype=torch.int32) + processed_signal, processed_signal_length = self.feature_extractor(input_signal, input_signal_length) + return processed_signal, processed_signal_length + + def setup_inputs(self, input_text, input_media, batch_size): + """ + Args: + input_text: str or List[str] or None + input_media: Tuple[np.array, np.array] + input_signal: audio signal in numpy array [b, -1] + input_signal_length: length of the audio signal in numpy array [b] + batch_size: int + + """ + input_signal, input_signal_length = input_media + processed_signal, processed_signal_length = self.process_audio(input_signal, input_signal_length) + processed_signal = processed_signal.to(self.device) + processed_signal_length = processed_signal_length.to(self.device) + if input_text is None: + input_text = "Q: what's the transcription of the audio? 
A:" + + if isinstance(input_text, str): + input_text = [input_text] * batch_size + + assert len(input_text) == batch_size + pre_prompt = [''] * batch_size + post_prompt = input_text + decoder_input_ids = None + attention_mask = None + return ( + input_text, + pre_prompt, + post_prompt, + processed_signal, + processed_signal_length, + decoder_input_ids, + attention_mask, + ) + + def load_test_media(self, input_media_path): + """ + Args: + input_media_path: str, path to the audio file + Returns: + input_signal: np.array [1, -1] + input_signal_length: np.array [1] + """ + waveform, sample_rate = sf.read(input_media_path, dtype=np.float32) + input_signal = np.array([waveform], dtype=np.float32) + input_signal_length = np.array([len(waveform)], dtype=np.int32) + return input_signal, input_signal_length + + def get_modality_encoder_features(self, modality_features, attention_mask): + """ + Do inference on the modality encoder engine + Args: + modality_features: dict {'input1': torch.tensor, 'input2': torch.tensor, ..} + attention_mask: None + Returns: + """ + + if attention_mask is not None: + modality_features['attention_mask'] = attention_mask + + tensor_info = [] + for key, tensor in modality_features.items(): + tensor_info.append(TensorInfo(key, torch_dtype_to_trt(tensor.dtype), tensor.shape)) + + output_info = self.modality_encoder_session.infer_shapes(tensor_info) + + outputs = { + t.name: torch.empty(tuple(t.shape), dtype=trt_dtype_to_torch(t.dtype), device=self.device) + for t in output_info + } + + ok = self.modality_encoder_session.run(modality_features, outputs, self.stream.cuda_stream) + assert ok, "Runtime execution failed for vision encoder session" + self.stream.synchronize() + + return outputs + + def preprocess(self, warmup, pre_prompt, post_prompt, processed_features, attention_mask, batch_size): + """ + Args: + warmup: bool + pre_prompt: List[str] + post_prompt: List[str] + processed_features: Tuple[torch.tensor, torch.tensor] + processed_signal: torch.tensor [B, 80, T] + processed_signal_length: torch.tensor [B] + attention_mask: None + batch_size: int + Returns: + input_ids: torch.tensor [B, L] + input_lengths: torch.tensor [B] + ptuning_args: List[torch.tensor] + encoded_features: torch.tensor [B, L, D] + """ + if not warmup: + profiler.start(self.modality.capitalize()) + + if not warmup: + profiler.stop(self.modality.capitalize()) + + assert self.model_type == 'salm', f"Invalid model type {self.model_type}" + + processed_features = { + "processed_signal": processed_features[0], + "processed_signal_length": processed_features[1].to(torch.int32), + } + encoded_outputs = self.get_modality_encoder_features(processed_features, attention_mask) + encoded_features, encoded_length = encoded_outputs['encoded'], encoded_outputs['encoded_length'] + pre_input_ids = self.tokenizer(pre_prompt).input_ids + post_input_ids = self.tokenizer(post_prompt).input_ids + input_lengths = [] + input_ids = [] + encoded_length = encoded_length.cpu().numpy() + fake_id_start = self.model.vocab_size + for i in range(batch_size): + feat_len = encoded_length[i] + feat_fake_ids = np.arange(fake_id_start, fake_id_start + feat_len) + cur_input_ids = np.concatenate([pre_input_ids[i], feat_fake_ids, post_input_ids[i]]) + fake_id_start += feat_len + input_lengths.append(len(cur_input_ids)) + input_ids.append(cur_input_ids) + + max_length = max(input_lengths) + # convert input_ids to torch tensor with padding + input_ids = [ + np.pad(ids, (0, max_length - len(ids)), 'constant', 
constant_values=self.tokenizer.pad_token_id) + for ids in input_ids + ] + input_ids = torch.tensor(input_ids, dtype=torch.int32) + input_lengths = torch.tensor(input_lengths, dtype=torch.int32) + ptuning_args = self.ptuning_setup(encoded_features, input_ids, input_lengths) + + return input_ids, input_lengths, ptuning_args, encoded_features + + def run( + self, + input_text, + input_media=None, + max_new_tokens: int = 30, + batch_size: int = 1, + top_k: int = 1, + top_p: float = 0.0, + temperature: float = 1.0, + repetition_penalty: float = 1.0, + num_beams: int = 1, + run_profiling=False, + check_accuracy=False, + input_signal=None, + input_signal_length=None, + ): + """ + Args: + input_text: str or List[str] or None + input_media: Tuple[np.array, np.array] or None + input_signal: audio signal in numpy array [b, -1] + input_signal_length: length of the audio signal in numpy array [b] + max_new_tokens: int + batch_size: int + top_k: int + top_p: float + temperature: float + repetition_penalty: float + num_beams: int + run_profiling: bool + check_accuracy: bool + """ + if input_media is None: + assert input_signal is not None and input_signal_length is not None + input_media = (input_signal, input_signal_length) + + ( + input_text, + pre_prompt, + post_prompt, + processed_signal, + processed_signal_length, + decoder_input_ids, + attention_mask, + ) = self.setup_inputs(input_text, input_media, batch_size) + processed_media = (processed_signal, processed_signal_length) + + self.generate( + pre_prompt, + post_prompt, + processed_media, + decoder_input_ids, + max_new_tokens, + attention_mask=attention_mask, + warmup=True, + batch_size=batch_size, + top_k=top_k, + top_p=top_p, + temperature=temperature, + repetition_penalty=repetition_penalty, + num_beams=num_beams, + ) + num_iters = self.profiling_iterations if run_profiling else 1 + for _ in range(num_iters): + output_text = self.generate( + pre_prompt, + post_prompt, + processed_media, + decoder_input_ids, + max_new_tokens, + attention_mask=attention_mask, + warmup=False, + batch_size=batch_size, + top_k=top_k, + top_p=top_p, + temperature=temperature, + repetition_penalty=repetition_penalty, + num_beams=num_beams, + ) + if self.runtime_rank == 0: + self.print_result(input_text, output_text, batch_size, num_beams, run_profiling, check_accuracy) + return output_text diff --git a/nemo/export/tensorrt_mm_exporter.py b/nemo/export/tensorrt_mm_exporter.py index b0536a55f95f..d4da0ac34b1c 100644 --- a/nemo/export/tensorrt_mm_exporter.py +++ b/nemo/export/tensorrt_mm_exporter.py @@ -21,8 +21,8 @@ import wrapt from nemo.deploy import ITritonDeployable -from nemo.export.multimodal.build import build_trtllm_engine, build_visual_engine -from nemo.export.multimodal.run import MultimodalModelRunner +from nemo.export.multimodal.build import build_perception_engine, build_trtllm_engine, build_visual_engine +from nemo.export.multimodal.run import MultimodalModelRunner, SpeechllmModelRunner use_deploy = True try: @@ -74,9 +74,13 @@ def __init__( self, model_dir: str, load_model: bool = True, + modality: str = "vision", ): self.model_dir = model_dir self.runner = None + # vision modality is for image and video + assert modality in ["vision", "audio"] + self.modality = modality if load_model: self._load() @@ -128,8 +132,12 @@ def export( dtype=dtype, ) - visual_dir = os.path.join(self.model_dir, "visual_engine") - build_visual_engine(visual_dir, visual_checkpoint_path, model_type, vision_max_batch_size) + if model_type == "salm": + perception_dir = 
os.path.join(self.model_dir, "perception_engine") + build_perception_engine(perception_dir, visual_checkpoint_path, model_type, vision_max_batch_size) + else: + visual_dir = os.path.join(self.model_dir, "visual_engine") + build_visual_engine(visual_dir, visual_checkpoint_path, model_type, vision_max_batch_size) if load_model: self._load() @@ -164,19 +172,32 @@ def forward( num_beams, ) + def get_input_media_tensors(self): + if self.modality == "vision": + return [Tensor(name="input_media", shape=(-1, -1, -1, 3), dtype=np.uint8)] + elif self.modality == "audio": + return [ + Tensor(name="input_signal", shape=(-1,), dtype=np.single), + Tensor(name="input_signal_length", shape=(1,), dtype=np.intc), + ] + return [] + @property def get_triton_input(self): inputs = ( - Tensor(name="input_text", shape=(-1,), dtype=bytes), - Tensor(name="input_media", shape=(-1, -1, -1, 3), dtype=np.uint8), - Tensor(name="batch_size", shape=(-1,), dtype=np.int_, optional=True), - Tensor(name="max_output_len", shape=(-1,), dtype=np.int_, optional=True), - Tensor(name="top_k", shape=(-1,), dtype=np.int_, optional=True), - Tensor(name="top_p", shape=(-1,), dtype=np.single, optional=True), - Tensor(name="temperature", shape=(-1,), dtype=np.single, optional=True), - Tensor(name="repetition_penalty", shape=(-1,), dtype=np.single, optional=True), - Tensor(name="num_beams", shape=(-1,), dtype=np.int_, optional=True), + [Tensor(name="input_text", shape=(-1,), dtype=bytes)] + + self.get_input_media_tensors() + + [ + Tensor(name="batch_size", shape=(-1,), dtype=np.int_, optional=True), + Tensor(name="max_output_len", shape=(-1,), dtype=np.int_, optional=True), + Tensor(name="top_k", shape=(-1,), dtype=np.int_, optional=True), + Tensor(name="top_p", shape=(-1,), dtype=np.single, optional=True), + Tensor(name="temperature", shape=(-1,), dtype=np.single, optional=True), + Tensor(name="repetition_penalty", shape=(-1,), dtype=np.single, optional=True), + Tensor(name="num_beams", shape=(-1,), dtype=np.int_, optional=True), + ] ) + inputs = tuple(inputs) return inputs @property @@ -198,6 +219,9 @@ def triton_infer_fn(self, **inputs: np.ndarray): infer_input["input_image"] = ndarray2img(inputs.pop("input_media")[0])[0] elif self.runner.model_type in video_model_list: infer_input["input_image"] = inputs.pop("input_media")[0] + elif self.runner.model_type == "salm": + infer_input["input_signal"] = inputs.pop("input_signal") + infer_input["input_signal_length"] = inputs.pop("input_signal_length")[:, 0] if "batch_size" in inputs: infer_input["batch_size"] = inputs.pop("batch_size")[0][0] if "max_output_len" in inputs: @@ -223,5 +247,9 @@ def triton_infer_fn(self, **inputs: np.ndarray): def _load(self): llm_dir = os.path.join(self.model_dir, "llm_engine") - visual_dir = os.path.join(self.model_dir, "visual_engine") - self.runner = MultimodalModelRunner(visual_dir, llm_dir) + if self.modality == "vision": + visual_dir = os.path.join(self.model_dir, "visual_engine") + self.runner = MultimodalModelRunner(visual_dir, llm_dir, self.modality) + elif self.modality == "audio": + perception_dir = os.path.join(self.model_dir, "perception_engine") + self.runner = SpeechllmModelRunner(perception_dir, llm_dir, self.modality) diff --git a/scripts/deploy/multimodal/deploy_triton.py b/scripts/deploy/multimodal/deploy_triton.py index d0bf8f10548a..18463a3fc24a 100755 --- a/scripts/deploy/multimodal/deploy_triton.py +++ b/scripts/deploy/multimodal/deploy_triton.py @@ -35,6 +35,16 @@ def get_args(argv): 
formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=f"Deploy nemo models to Triton", ) + # default modality is vision, can be changed to audio + parser.add_argument( + "-mod", + "--modality", + type=str, + required=False, + default="vision", + choices=["vision", "audio"], + help="Modality of the model", + ) parser.add_argument("-vc", "--visual_checkpoint", type=str, help="Source .nemo file for visual model") parser.add_argument( "-lc", @@ -48,7 +58,7 @@ def get_args(argv): "--model_type", type=str, required=True, - choices=["neva", "video-neva", "lita", "vila", "vita"], + choices=["neva", "video-neva", "lita", "vila", "vita", "salm"], help="Type of the model that is supported.", ) parser.add_argument( @@ -123,8 +133,7 @@ def get_trt_deployable(args): raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.") exporter = TensorRTMMExporter( - model_dir=trt_path, - load_model=(args.visual_checkpoint is None), + model_dir=trt_path, load_model=(args.visual_checkpoint is None), modality=args.modality ) if args.visual_checkpoint is not None: From 8524596ec0e70a356040a21b5f3a1e1bb1d3b1ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 26 Aug 2024 20:45:55 -0700 Subject: [PATCH 15/31] =?UTF-8?q?[=F0=9F=A4=A0]:=20Howdy=20folks,=20let's?= =?UTF-8?q?=20bump=20`Dockerfile.ci`=20to=20ef85bc9=20!=20(#10250)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> --- Dockerfile.ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.ci b/Dockerfile.ci index 161671bf5a5a..3ef2ca64bee7 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -34,7 +34,7 @@ WORKDIR /workspace # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.15.0 -ARG MCORE_TAG=124bcff2a8153eccea4d7d0e4df5c5562aab50b9 +ARG MCORE_TAG=ef85bc94fc744aa5d398d12140f808023afbf78d ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \ --mount=type=bind,source=requirements,target=requirements \ From f1f145a916f6a07e5d4a3875f60154e9e713fbc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 26 Aug 2024 22:45:49 -0700 Subject: [PATCH 16/31] =?UTF-8?q?[=F0=9F=A4=A0]:=20Howdy=20folks,=20let's?= =?UTF-8?q?=20bump=20`Dockerfile.ci`=20to=2001ca03f=20!=20(#10266)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Signed-off-by: oliver könig Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> --- Dockerfile.ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.ci b/Dockerfile.ci index 3ef2ca64bee7..e687c385cce8 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -34,7 +34,7 @@ WORKDIR /workspace # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.15.0 -ARG MCORE_TAG=ef85bc94fc744aa5d398d12140f808023afbf78d +ARG MCORE_TAG=01ca03f11e89f4f85682dcac647c2b913b25fcee ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \ --mount=type=bind,source=requirements,target=requirements \ From 0d1e460bc0516e76920aacda2664a0638e9d1577 Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 27 Aug 2024 16:31:18 +0200 Subject: [PATCH 17/31] Load model in the target export precision 
by default in PTQ (#10267) * Load model in the target export precision by default Signed-off-by: Jan Lasek * Enable megatron_amp_O2=true to actually use half-precision Signed-off-by: Jan Lasek --------- Signed-off-by: Jan Lasek Signed-off-by: Jan Lasek --- examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml index f603ebb58eb7..62f0e452d3b5 100644 --- a/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml @@ -17,13 +17,15 @@ trainer: num_nodes: 1 accelerator: gpu logger: false # logger provided by exp_manager - precision: bf16 # 16, 32, or bf16 + precision: ${export.dtype} # 16, bf16, or 32 enable_checkpointing: false model: tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 restore_from_path: llama2-7b-fp16.nemo # Nemo file path + precision: ${export.dtype} # Model weights data type + megatron_amp_O2: true # Enable Megatron O2-style half-precision ## Activation Checkpoint activations_checkpoint_granularity: null # 'selective' or 'full' @@ -42,7 +44,7 @@ export: decoder_type: llama # gptnext, gpt2, llama inference_tensor_parallel: 1 # Default using 1 TP for inference inference_pipeline_parallel: 1 # Default using 1 PP for inference - dtype: ${trainer.precision} # Default precision data type + dtype: 16 # Default precision data type for non-quantized layers: 16 or bf16 save_path: llama2-7b-${quantization.algorithm}.qnemo # Path where the quantized model will be saved compress: false # Whether save_path should be a tarball or a directory sample_output: true # Whether to run a sample prompt before saving From f131db2296713549f5cd4bc8ad31cedfc8494414 Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Tue, 27 Aug 2024 09:53:08 -0700 Subject: [PATCH 18/31] Add WandbPlugin, NsysPlugin and PreemptionPlugin to nemo.lightning.run.plugins (#10223) * Add WandbPlugin, NsysPlugin and PreemptionPlugin to nemo.lightning.run.plugins Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Remove duplicate Signed-off-by: Hemil Desai * Add entity to wandb logger Signed-off-by: Hemil Desai * Add documentation Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Add warning Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * PR feedback Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Add comments Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai --------- Signed-off-by: Hemil Desai Signed-off-by: hemildesai Co-authored-by: hemildesai --- nemo/collections/llm/recipes/log/default.py | 9 +- nemo/lightning/run/__init__.py | 0 nemo/lightning/run/plugins.py | 165 ++++++++++++++++++++ 3 files changed, 172 insertions(+), 2 deletions(-) create mode 100644 nemo/lightning/run/__init__.py create mode 100644 nemo/lightning/run/plugins.py diff --git a/nemo/collections/llm/recipes/log/default.py b/nemo/collections/llm/recipes/log/default.py index dc18565a0e06..4d5e9223b535 100644 --- a/nemo/collections/llm/recipes/log/default.py +++ b/nemo/collections/llm/recipes/log/default.py @@ -10,14 +10,19 @@ def tensorboard_logger(name: str, save_dir: str = "tb_logs") -> Config[TensorBoa return Config(TensorBoardLogger, save_dir=save_dir, name=name) -def 
wandb_logger(project: str, name: str) -> Config[WandbLogger]: - return Config( +def wandb_logger(project: str, name: str, entity: Optional[str] = None) -> Config[WandbLogger]: + cfg = Config( WandbLogger, project=project, name=name, config={}, ) + if entity: + cfg.entity = entity + + return cfg + def default_log( ckpt_dir: str, diff --git a/nemo/lightning/run/__init__.py b/nemo/lightning/run/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/nemo/lightning/run/plugins.py b/nemo/lightning/run/plugins.py new file mode 100644 index 000000000000..0f6a76d4799f --- /dev/null +++ b/nemo/lightning/run/plugins.py @@ -0,0 +1,165 @@ +import copy +import logging +import os +from dataclasses import dataclass, field +from pathlib import Path +from typing import Callable, Optional + +import nemo_run as run +import yaml +from nemo_run.core.serialization.yaml import YamlSerializer +from pytorch_lightning import Callback +from pytorch_lightning.loggers import WandbLogger + +from nemo.lightning.pytorch.callbacks import NsysCallback, PreemptionCallback +from nemo.utils import logging + +# This file contains plugins based on NeMo-Run's run.Plugin API. +# Plugins operate both on a configured task and an executor at the same time, and are specific to NeMo-Run. +# If you are adding functionality that goes directly into the Pytorch Lightning trainer, you may consider adding a callback instead of a plugin. + + +def _merge_callbacks(partial: run.Partial, callbacks: list[run.Config[Callback]]): + if hasattr(partial, "trainer"): + if hasattr(partial.trainer, "callbacks"): + for callback in callbacks: + if callback not in partial.trainer.callbacks: + partial.trainer.callbacks.append(callback) + else: + partial.trainer.callbacks = copy.deepcopy(callbacks) + + +@dataclass(kw_only=True) +class PreemptionPlugin(run.Plugin): + """ + A plugin for setting up Preemption callback and preemption signals. + + Args: + preempt_time (int): The time, in seconds, before the task's time limit at which the executor + will send a SIGTERM preemption signal. This allows tasks to be gracefully + stopped before reaching their time limit, reducing waste and + promoting fair resource usage. The default value is 300 seconds (5 minutes). + This is only supported for ``run.SlurmExecutor``. + callbacks (list[run.Config[Callback]]): A list of callback configurations that the plugin + will merge with the task's existing callbacks. + By default, the list includes NeMo's preemption callback. + """ + + preempt_time: int = 300 + callbacks: list[run.Config[Callback]] = field(default_factory=lambda: [run.Config(PreemptionCallback)]) + + def setup(self, task: run.Partial | run.Script, executor: run.Executor): + if isinstance(task, run.Script): + logging.warning( + f"The {self.__class__.__name__} will have no effect on the task as it's an instance of run.Script" + ) + return + + if isinstance(executor, run.SlurmExecutor): + # Sends a SIGTERM self.preempt_time seconds before hitting time limit + logging.info( + f"{self.__class__.__name__} will send a SIGTERM {self.preempt_time} seconds before the job's time limit for your Slurm executor." + ) + executor.signal = f"TERM@{self.preempt_time}" + + _merge_callbacks(task, callbacks=self.callbacks) + + +@dataclass(kw_only=True) +class NsysPlugin(run.Plugin): + """ + A plugin for nsys profiling. + + The NsysPlugin allows you to profile your run using nsys. + You can specify when to start and end the profiling, on which ranks to run the profiling, + and what to trace during profiling. 
+ + Args: + start_step (int): The step at which to start the nsys profiling. + end_step (int): The step at which to end the nsys profiling. + ranks (Optional[list[int]]): The ranks on which to run the nsys profiling. If not specified, + profiling will be run on rank 0. + nsys_trace (Optional[list[str]]): The events to trace during profiling. If not specified, + 'nvtx' and 'cuda' events will be traced. + """ + + start_step: int + end_step: int + ranks: Optional[list[int]] = None + nsys_trace: Optional[list[str]] = None + + def setup(self, task: run.Partial | run.Script, executor: run.Executor): + if isinstance(task, run.Partial): + nsys_callback = run.Config( + NsysCallback, + start_step=self.start_step, + end_step=self.end_step, + ranks=self.ranks or [0], + ) + callbacks: list[run.Config[Callback]] = [nsys_callback] # type: ignore + _merge_callbacks(task, callbacks=callbacks) + + launcher = executor.get_launcher() + launcher.nsys_profile = True + launcher.nsys_trace = self.nsys_trace or ["nvtx", "cuda"] + + +@dataclass(kw_only=True) +class WandbPlugin(run.Plugin): + """ + A plugin for setting up Weights & Biases. + + This plugin sets a ``WandbLogger`` to ``NeMoLogger``'s ``wandb`` arg, + which in turn initializes the Pytorch Lightning `WandbLogger `_. + + This plugin is only activated if the ``WANDB_API_KEY`` environment variable is set. + The ``WANDB_API_KEY`` environment variables will also be set in the executor's environment variables. + Follow https://docs.wandb.ai/quickstart to retrieve your ``WANDB_API_KEY``. + + If `log_task_config` is True, the plugin will log the task configuration as a config dictionary + to the Weights and Biases logger. + + Args: + name (str): The name for the Weights & Biases run. + logger_fn (Callable[..., run.Config[WandbLogger]]): A callable that returns a Config of ``WandbLogger`` + log_task_config (bool, optional): Whether to log the task configuration to the logger. + Defaults to True. + + Raises: + logging.warning: If the task is an instance of `run.Script`, as the plugin has no effect on such tasks. + """ + + name: str + logger_fn: Callable[..., run.Config[WandbLogger]] + log_task_config: bool = True + + def setup(self, task: run.Partial | run.Script, executor: run.Executor): + if isinstance(task, run.Script): + logging.warning( + f"The {self.__class__.__name__} will have no effect on the task as it's an instance of run.Script" + ) + return + + if "WANDB_API_KEY" in os.environ: + executor.env_vars["WANDB_API_KEY"] = os.environ["WANDB_API_KEY"] + + if hasattr(task, "log") and hasattr(task.log, "wandb"): + task.log.wandb = self.logger_fn(name=self.name) + if self.log_task_config: + partial_config = yaml.safe_load(YamlSerializer().serialize(task)) + partial_config["experiment"] = { + "id": self.experiment_id, + "task_name": self.name, + "executor": executor.info(), + "remote_directory": ( + os.path.join(executor.tunnel.job_dir, Path(executor.job_dir).name) + if isinstance(executor, run.SlurmExecutor) + else None + ), + "local_directory": executor.job_dir, + } + task.log.wandb.config = partial_config + else: + logging.warning( + f"The {self.__class__.__name__} will have no effect as WANDB_API_KEY environment variable is not set." 
+ ) From 86dcd990ceb1b3b1b4bc2a8585aaa62f788abcc7 Mon Sep 17 00:00:00 2001 From: Anna Shors <71393111+ashors1@users.noreply.github.com> Date: Tue, 27 Aug 2024 14:51:20 -0700 Subject: [PATCH 19/31] [NeMo-UX] Handle absolute logger directories in nemo_logger (#10259) * handle absolute and relative logger directories Signed-off-by: Anna Shors * merge lines Signed-off-by: ashors1 --------- Signed-off-by: Anna Shors Signed-off-by: ashors1 --- nemo/lightning/nemo_logger.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/nemo/lightning/nemo_logger.py b/nemo/lightning/nemo_logger.py index 6509c384f8cf..bae62f09593b 100644 --- a/nemo/lightning/nemo_logger.py +++ b/nemo/lightning/nemo_logger.py @@ -30,11 +30,10 @@ class NeMoLogger(IOMixin): log_global_rank_0_only (bool): Log only on global rank 0. files_to_copy (Optional[List[str]]): List of files to copy to log directory. update_logger_directory (bool): Whether to update logger directory to write to `exp_dir`. - If True, the `save_dir` passed to the logger will be treated as a relative path and - the logger will be reconfigured to write to `exp_dir / save_dir`. This ensures that - all output from an experiment is written to a common directory. If False, the logger's - save_dir will not be overwritten. This argument applies only to TensorBoardLogger and - WandbLogger instances. + If True, the `save_dir` passed to the logger will be reconfigured to write to `exp_dir / save_dir`. + This ensures that all output from an experiment is written to a common directory. + If False, the logger's save_dir will not be overwritten. + This argument applies only to TensorBoardLogger and WandbLogger instances. ckpt (Optional[ModelCheckpoint]): Model checkpoint callback. tensorboard: (Optional[TensorBoardLogger]): A PyTorch Lightning TensorBoardLogger instance to add to the trainer. @@ -158,7 +157,7 @@ def _setup_trainer_loggers(self, trainer, dir, version): for logger in trainer.loggers: if isinstance(logger, TensorBoardLogger): logger._version = version or "" - logger._root_dir = Path(dir) / logger.save_dir + logger._root_dir = Path(dir) / os.path.relpath(logger.save_dir) trainer.logger._name = self.name logging.warning( f'"update_logger_directory" is True. Overwriting tensorboard logger "save_dir" to {logger._root_dir}' From 97ce34abaaef93750e15589cdc56aeaafda074db Mon Sep 17 00:00:00 2001 From: Ming <111467530+Victor49152@users.noreply.github.com> Date: Tue, 27 Aug 2024 15:20:22 -0700 Subject: [PATCH 20/31] Add sdxl notebook (#10139) * Add sdxl notebook Signed-off-by: mingyuanm * Rename Signed-off-by: mingyuanm * final Update SDXL notebook Signed-off-by: mingyuanm --------- Signed-off-by: mingyuanm --- docs/source/multimodal/text2img/sd.rst | 2 +- tutorials/multimodal/SDXL Tutorial.ipynb | 253 +++++++++++++++++++++++ 2 files changed, 254 insertions(+), 1 deletion(-) create mode 100644 tutorials/multimodal/SDXL Tutorial.ipynb diff --git a/docs/source/multimodal/text2img/sd.rst b/docs/source/multimodal/text2img/sd.rst index 6f5092f93f5f..549f13bbabf6 100644 --- a/docs/source/multimodal/text2img/sd.rst +++ b/docs/source/multimodal/text2img/sd.rst @@ -163,7 +163,7 @@ Optimization related configurations Training with precached latents ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Since the VAE and text encoder remain frozed during training, you can pre-calculate the image and caption latents offline, enhancing training throughput. To create a pre-cached dataset, see :doc:`Multimodal Dataset <./datasets>`. 
For training using this dataset, configure ``model.data`` section properly and set ``model.first_stage_key=image_encoded`` along with ``model.cond_stage_key=captions_encoded``. +Since the VAE and text encoder remain frozen during training, you can pre-calculate the image and caption latents offline, enhancing training throughput. To create a pre-cached dataset, see :doc:`Multimodal Dataset <./datasets>`. For training using this dataset, configure ``model.data`` section properly and set ``model.first_stage_key=image_encoded`` along with ``model.cond_stage_key=captions_encoded``. Reference ----------- diff --git a/tutorials/multimodal/SDXL Tutorial.ipynb b/tutorials/multimodal/SDXL Tutorial.ipynb new file mode 100644 index 000000000000..92667100b405 --- /dev/null +++ b/tutorials/multimodal/SDXL Tutorial.ipynb @@ -0,0 +1,253 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "d874e23f-9631-48e0-b635-84e7280bf07b", + "metadata": {}, + "source": [ + "# SDXL Training / Inference Tutorial\n", + "\n", + "### Note:\n", + "Currently, this notebook must be run in a NeMo container (> 24.09) and open_clip_torch<=2.24.0. An example command to launch the container:\n", + "\n", + "```\n", + "docker run --gpus all -it --rm -v :/opt/NeMo -v :/datasets --shm-size=8g \\\n", + " -p 8888:8888 --ulimit memlock=-1 --ulimit \\\n", + " stack=67108864 \n", + "```\n", + "\n", + "\n", + "## Introduction\n", + "\n", + "This notebook illustrates how to train and perform inference using Stable Diffusion XL with the NeMo Toolkit. Despite differences in model configs, the training and inference procedure is similar as Stable Diffusion.\n", + "\n", + "The implementation of Stable Diffusion XL is based on [SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis](https://arxiv.org/abs/2307.01952).\n", + "\n", + "This tutorial will guide you through the following topics:\n", + "\n", + "1. Training a Stable Diffusion XL model.\n", + "2. Performing inference with the trained model.\n", + "\n", + "## Datasets\n", + "\n", + "Please refer to [Dataset Tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/multimodal/Multimodal%20Data%20Preparation.ipynb) for how to prepare a training dataset for Stable diffusion XL.\n", + "\n", + "For a pre-cached Stable Diffusion dataset, each webdataset tar file should, at a minimum, include the pickle files that store the pre-cached image and text features:\n", + "\n", + "```\n", + "t0_r0_0.tar\n", + "|---- 0000.pickle\n", + "|---- 0001.pickle\n", + "...\n", + "```\n", + "\n", + "For non-precached Stable Diffusion dataset, each webdataset tar file should contain the raw texts and corresponding images:\n", + "\n", + "```\n", + "t0_r0_0.tar\n", + "|---- 0000.jpg\n", + "|---- 0000.txt\n", + "|---- 0001.jpg\n", + "|---- 0001.txt\n", + "...\n", + "```\n", + "\n", + "## Encoders Preparation\n", + "\n", + "Depending on whether you precache the dataset, you might also need to first download the image and/or text encoders.\n", + "\n", + "### Option 1: Training on Non-Precached Dataset (Use Encoders During Training)\n", + "\n", + "#### A. Prepare VAE\n", + "To download the default VAE for Stable Diffusion:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "730cd137-0fce-4bab-8ac7-219e5c55faf2", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "! wget https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/vae/diffusion_pytorch_model.safetensors\n", + "! mkdir -p /sdxl_ckpts\n", + "! 
mv diffusion_pytorch_model.safetensors /sdxl_ckpts/vae.safetensors" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "fef8b245-7cee-4048-a9ec-3ada90432a89", + "metadata": {}, + "source": [ + "The above command will download the default VAE weights from HuggingFace and save it to `/sdxl_ckpts/vae.safetensors`.\n", + "\n", + "**Note**: if you want to customize the saved location, make sure it is also reflected in your training config.\n", + "#### B. Prepare Text Encoder\n", + "The text encoders used in Stable Diffusion XL will be automatically downloaded by the training script we provide.\n", + "\n", + "The type of text encoder used in the SDXL model conditioner can be found in `conditioner_config` in the predefined training configs:\n", + "\n", + "```\n", + " conditioner_config:\n", + " _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.GeneralConditioner\n", + " emb_models:\n", + " - is_trainable: false\n", + " input_key: captions\n", + " ucg_rate: 0.1\n", + " emb_model:\n", + " _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenCLIPEmbedder\n", + " layer: hidden\n", + " layer_idx: 11\n", + " - is_trainable: false\n", + " ucg_rate: 0.1\n", + " input_key: captions\n", + " emb_model:\n", + " _target_: nemo.collections.multimodal.modules.stable_diffusion.encoders.modules.FrozenOpenCLIPEmbedder2\n", + " arch: ViT-bigG-14\n", + " version: laion2b_s39b_b160k\n", + " freeze: true\n", + " layer: penultimate\n", + " always_return_pooled: true\n", + " legacy: false\n", + "```" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8854eb7a-e822-43f6-a1d5-12357049485a", + "metadata": {}, + "source": [ + "\n", + "### Option 2: Training on Precached Dataset (Training UNet Only)\n", + "\n", + "When using a precached dataset (please refer to the [Dataset Tutorial](https://github.com/NVIDIA/NeMo/blob/main/tutorials/multimodal/Multimodal%20Data%20Preparation.ipynb) for details), every text feature and image feature is stored as a key-value pair in a `.pickle` file:\n", + "\n", + "```\n", + "{\n", + " image_key: torch.Tensor(),\n", + " text_key: torch.Tensor(),\n", + "}\n", + "```\n", + "\n", + "Make sure that in the training config, `cond_stage_key` is associated with `text_key` and `first_stage_key` is associated with `image_key`.\n", + "\n", + "We offer an example script to convert a dataset from a `parquet` file to webdataset `tar` files at [parquet_conversion](https://github.com/NVIDIA/NeMo/blob/main/scripts/multimodal_dataset_conversion/parquet_conversion.py). Three different modes of precached training are provided; they are:\n", + "\n", + "1. No Caching: VAE and Text encoders are loaded during training\n", + "2. Text only: Only text features are loaded from dataset during training\n", + "3. Both: Both image and text features are loaded from dataset during training\n", + "\n", + "In each mode, the non-cached components should be saved in their raw format in tar files, while the cached components should be saved as torch.Tensor()." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "5762427b-f60c-4dfd-8318-e55771b25354", + "metadata": {}, + "source": [ + "## Model Config Setup\n", + "\n", + "Now we will begin setting up the config file needed for Stable Diffusion XL training. We will use [sd_xl_base_train.yaml](https://github.com/NVIDIA/NeMo/blob/main/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_base_train.yaml) as the template.\n", + "\n", + "1. 
Modify `model.data.train.dataset_path` so that it has all the webdataset info files you want to train on\n", + "2. Modify `model.data.webdataset.local_root_path` to point to your dataset path\n", + "3. Make sure VAE path `model.first_stage_config.from_pretrained` is adjusted if using non-precached dataset\n", + "4. Make sure the `model.precache mode` is set properly with the dataset you prepared, as detailed above.\n", + "5. Configure `exp_manager.exp_dir` for experiment save directory\n", + "6. Configure `exp_manager.wandb_logger_kwargs` and/or `exp_manager.create_tensorboard_logger` if needed" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "70f858b3-f7d5-4678-b380-80582337bc23", + "metadata": {}, + "source": [ + "**Note**: Please refer to NeMo Toolkit Developer Guide's Stable Diffusion page for more details on in-depth customizations, including all available optimizations.\n", + "\n", + "## Training\n", + "\n", + "Once everything is set up, training stable diffusion is as simple as running:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "589e3a14-c881-4a56-b2bd-370653059dfc", + "metadata": {}, + "outputs": [], + "source": "! torchrun /opt/NeMo/examples/multimodal/text_to_image/stable_diffusion/sd_xl_train.py trainer.max_steps=100 model.data.train.dataset_path=/path/to/wdinfo.pkl model.data.webdataset.local_root_path=/path/to/dataset trainer.devices=1 trainer.num_nodes=1 model.micro_batch_size=1 model.global_batch_size=1 model.first_stage_config.from_pretrained=/sdxl_ckpts/vae.safetensors model.fsdp=False" + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "892d72dd-c4d7-4ca4-a948-168e187af65c", + "metadata": {}, + "source": [ + "Intermediate checkpoints (during training) and final checkpoint will be saved to `exp_manager.exp_dir` folder. Note that here we use synthetic data for demo purpose." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "087c8b9a-92c3-43d3-86a3-bf7e848dfbd2", + "metadata": {}, + "source": [ + "## Inference\n", + "\n", + "Stable Diffusion XL inference needs a trained NeMo Stable Diffusion checkpoint, along with both the image encoder (VAE) and text encoder (CLIP). The checkpoint can be either a fully trained `.nemo` checkpoint or an intermediate checkpoint from training (typically in `.ckpt` format). \n", + "\n", + "### Inference Config Setup\n", + "\n", + "Now we will begin setting up the config file needed for Stable Diffusion inference. We will use [sd_xl_infer_v2.yaml](https://github.com/NVIDIA/NeMo/blob/main/examples/multimodal/text_to_image/stable_diffusion/conf/sd_xl_infer_v2.yaml) as the template.\n", + "\n", + "We generally use [Classifier Free Guidance](https://arxiv.org/abs/2207.12598) for better visual quality, which can be set at `sampling.base.scale`.\n", + "\n", + "NeMo Stable Diffusion supports multiple samplers. Please refer to the developer guide for more details. Samplers can be set at `sampling.base.sampler`.\n", + "\n", + "Inference supports a batch of text prompts, which can be set at `infer.prompt`. One can also generate a configurable number of images per prompt by setting `infer.num_samples`. 
Generated images will be saved to `out_path`.\n", + "\n", + "You will also need to set the model checkpoint path at `model.restore_from_path` if you are loading from `.nemo` checkpoint, otherwise, mannually set `unet` checkpoints and `vae` checkpoint at `model.unet_config.from_pretrained` and `model.first_stage_config.from_pretrained`, respectively.\n", + "\n", + "### Running the Inference\n", + "\n", + "Once everything is set up, Stable Diffusion inference is as simple as running:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e676c5d-d711-489e-8ab7-3ee20046d88d", + "metadata": {}, + "outputs": [], + "source": "! torchrun /opt/NeMo/examples/multimodal/text_to_image/stable_diffusion/sd_xl_infer.py model.restore_from_path=/path/to/stable-diffusion-xl-train.nemo out_path=/sdxl_infer_out" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From c52a0a4adcd920664151acac531d4dd95ef6d3bb Mon Sep 17 00:00:00 2001 From: Shanmugam Ramasamy Date: Tue, 27 Aug 2024 15:32:28 -0700 Subject: [PATCH 21/31] Updating some coments --- nemo/export/tensorrt_llm.py | 51 ++++++++++++++----------------------- 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index f62df8249e92..8389e2b519ad 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -169,6 +169,7 @@ def export( multiple_profiles: bool = False, gpt_attention_plugin: str = "auto", gemm_plugin: str = "auto", + use_mcore_path: bool = True, ): """ Exports nemo checkpoints to TensorRT-LLM. @@ -203,6 +204,7 @@ def export( multiple_profiles: (bool): enables multiple profiles feature of TRT-LLM. Default = False gpt_attention_plugin (str): enable the gpt attention plugin. Default = "auto" gemm_plugin (str): enable the gpt plugin. 
Default = "auto" + use_mcore_path (bool) : Use the more recent mcore path for export """ if n_gpus is not None: warnings.warn( @@ -306,9 +308,10 @@ def export( ) model, model_configs, self.tokenizer = load_nemo_model(nemo_checkpoint_path, nemo_export_dir) - USE_NEW_CODE = True - if USE_NEW_CODE: + if use_mcore_path: + from megatron.core.export.model_type import ModelType + from megatron.core.export.model_config import ModelConfig from megatron.core.export.data_type import DataType from megatron.core.export.export_config import ExportConfig from megatron.core.export.model_config import ModelConfig @@ -347,40 +350,26 @@ def get_model_config(nemo_model_config): ) conf.moe_tp_mode = nemo_model_config.get('moe_tp_mode', 2) conf.seq_len_interpolation_factor = nemo_model_config.get("seq_len_interpolation_factor") - conf.mcore_gpt = nemo_model_config.get("mcore_gpt", False) - conf.share_embeddings_and_output_weights = nemo_model_config.get( - "share_embeddings_and_output_weights", False - ) + conf.share_embeddings_and_output_weights = nemo_model_config.get("share_embeddings_and_output_weights", False) conf.apply_embedding_scaling = nemo_model_config.get("apply_embedding_scaling", False) conf.multi_query_mode = nemo_model_config.get("multi_query_mode", False) - conf.normalization = nemo_model_config.get("normalization", "") - conf.precision = nemo_model_config.get("precision") return conf - - input_model_config = get_model_config(model_configs) + + # We use a unified model config to support nemo and mcore. So we convert nemo config to this model config + input_model_config = get_model_config(model_configs) input_model_type = getattr(ModelType, model_type) + + # MCore export supports some default conversion dictionaries mcore_model_conversion_dict = DEFAULT_CONVERSION_DICT[input_model_type] - nemo_model_conversion_dict = { - f'model.{key}': value for key, value in mcore_model_conversion_dict.items() - } - trtllm_helper = TRTLLMHelper( - input_model_config, input_model_type, trtllm_conversion_dict=nemo_model_conversion_dict - ) + # All Mcore conversion dicts start with "decoder.layers.4.blah.blah" , while nemo models start with "model.decoder.layers.4.blahblah". so we append model. 
to the keys + nemo_model_conversion_dict = {f'model.{key}':value for key, value in mcore_model_conversion_dict.items()} - input_dtype = getattr(DataType, dtype) - export_config = ExportConfig( - tensor_parallelism_size, - pipeline_parallelism_size, - use_parallel_embedding, - use_embedding_sharing, - gpus_per_node, - ) + trtllm_helper = TRTLLMHelper(input_model_config, input_model_type, trtllm_conversion_dict = nemo_model_conversion_dict) - trtllm_model_weights_list, trtllm_model_config_list = ( - trtllm_helper.get_trtllm_pretrained_config_and_model_weights( - model_state_dict=model, export_config=export_config, dtype=input_dtype - ) - ) + input_dtype = getattr(DataType, dtype) + export_config = ExportConfig(tensor_parallelism_size, pipeline_parallelism_size, use_parallel_embedding, use_embedding_sharing) + + trtllm_model_weights_list, trtllm_model_config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(model_state_dict = model, export_config = export_config, dtype = input_dtype, num_process = 1) for trtllm_model_weights, trtllm_model_config in zip( trtllm_model_weights_list, trtllm_model_config_list @@ -397,11 +386,9 @@ def get_model_config(nemo_model_config): max_lora_rank=max_lora_rank, lora_target_modules=lora_target_modules, max_prompt_embedding_table_size=max_prompt_embedding_table_size, - enable_multi_block_mode=False, paged_kv_cache=paged_kv_cache, remove_input_padding=remove_input_padding, paged_context_fmha=paged_context_fmha, - use_custom_all_reduce=True, use_refit=False, max_num_tokens=max_num_tokens, max_seq_len=max_seq_len, @@ -425,7 +412,7 @@ def get_model_config(nemo_model_config): decoder_type=model_type, dtype=dtype, tensor_parallel_size=tensor_parallelism_size, - pipeline_parallel_size=pipeline_parallelism_size, + pipeline_parallel_size=2, gpus_per_node=gpus_per_node, use_parallel_embedding=use_parallel_embedding, use_embedding_sharing=use_embedding_sharing, From ed26d899c65c270279de0c96ef32abd524957b7a Mon Sep 17 00:00:00 2001 From: shanmugamr1992 Date: Tue, 27 Aug 2024 22:43:38 +0000 Subject: [PATCH 22/31] Apply isort and black reformatting Signed-off-by: shanmugamr1992 --- nemo/export/tensorrt_llm.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 8389e2b519ad..ffbaa58d037c 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -169,7 +169,7 @@ def export( multiple_profiles: bool = False, gpt_attention_plugin: str = "auto", gemm_plugin: str = "auto", - use_mcore_path: bool = True, + use_mcore_path: bool = True, ): """ Exports nemo checkpoints to TensorRT-LLM. 
@@ -310,8 +310,6 @@ def export( model, model_configs, self.tokenizer = load_nemo_model(nemo_checkpoint_path, nemo_export_dir) if use_mcore_path: - from megatron.core.export.model_type import ModelType - from megatron.core.export.model_config import ModelConfig from megatron.core.export.data_type import DataType from megatron.core.export.export_config import ExportConfig from megatron.core.export.model_config import ModelConfig @@ -350,26 +348,41 @@ def get_model_config(nemo_model_config): ) conf.moe_tp_mode = nemo_model_config.get('moe_tp_mode', 2) conf.seq_len_interpolation_factor = nemo_model_config.get("seq_len_interpolation_factor") - conf.share_embeddings_and_output_weights = nemo_model_config.get("share_embeddings_and_output_weights", False) + conf.share_embeddings_and_output_weights = nemo_model_config.get( + "share_embeddings_and_output_weights", False + ) conf.apply_embedding_scaling = nemo_model_config.get("apply_embedding_scaling", False) conf.multi_query_mode = nemo_model_config.get("multi_query_mode", False) return conf - + # We use a unified model config to support nemo and mcore. So we convert nemo config to this model config - input_model_config = get_model_config(model_configs) + input_model_config = get_model_config(model_configs) input_model_type = getattr(ModelType, model_type) # MCore export supports some default conversion dictionaries mcore_model_conversion_dict = DEFAULT_CONVERSION_DICT[input_model_type] # All Mcore conversion dicts start with "decoder.layers.4.blah.blah" , while nemo models start with "model.decoder.layers.4.blahblah". so we append model. to the keys - nemo_model_conversion_dict = {f'model.{key}':value for key, value in mcore_model_conversion_dict.items()} + nemo_model_conversion_dict = { + f'model.{key}': value for key, value in mcore_model_conversion_dict.items() + } - trtllm_helper = TRTLLMHelper(input_model_config, input_model_type, trtllm_conversion_dict = nemo_model_conversion_dict) + trtllm_helper = TRTLLMHelper( + input_model_config, input_model_type, trtllm_conversion_dict=nemo_model_conversion_dict + ) input_dtype = getattr(DataType, dtype) - export_config = ExportConfig(tensor_parallelism_size, pipeline_parallelism_size, use_parallel_embedding, use_embedding_sharing) - - trtllm_model_weights_list, trtllm_model_config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(model_state_dict = model, export_config = export_config, dtype = input_dtype, num_process = 1) + export_config = ExportConfig( + tensor_parallelism_size, + pipeline_parallelism_size, + use_parallel_embedding, + use_embedding_sharing, + ) + + trtllm_model_weights_list, trtllm_model_config_list = ( + trtllm_helper.get_trtllm_pretrained_config_and_model_weights( + model_state_dict=model, export_config=export_config, dtype=input_dtype, num_process=1 + ) + ) for trtllm_model_weights, trtllm_model_config in zip( trtllm_model_weights_list, trtllm_model_config_list From e3c52837a5530420c6784ceedd80ea0df327ef7d Mon Sep 17 00:00:00 2001 From: Shanmugam Ramasamy Date: Tue, 27 Aug 2024 15:46:50 -0700 Subject: [PATCH 23/31] Updating some coments --- nemo/export/tensorrt_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index ffbaa58d037c..e32a1cc7ff25 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -169,7 +169,7 @@ def export( multiple_profiles: bool = False, gpt_attention_plugin: str = "auto", gemm_plugin: str = "auto", - use_mcore_path: bool = True, + 
use_mcore_path: bool = False, ): """ Exports nemo checkpoints to TensorRT-LLM. From 1b07bd15dc4690fb7173454d3f6f2a4406f4e387 Mon Sep 17 00:00:00 2001 From: shanmugamr1992 Date: Tue, 27 Aug 2024 22:49:38 +0000 Subject: [PATCH 24/31] Apply isort and black reformatting Signed-off-by: shanmugamr1992 --- nemo/export/tensorrt_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index e32a1cc7ff25..3ae297e53995 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -169,7 +169,7 @@ def export( multiple_profiles: bool = False, gpt_attention_plugin: str = "auto", gemm_plugin: str = "auto", - use_mcore_path: bool = False, + use_mcore_path: bool = False, ): """ Exports nemo checkpoints to TensorRT-LLM. From 3c1e2c10bbf746be795d282c1fe0ef5747564d30 Mon Sep 17 00:00:00 2001 From: Shanmugam Ramasamy Date: Tue, 27 Aug 2024 15:50:42 -0700 Subject: [PATCH 25/31] Updating some coments --- nemo/export/trt_llm/converter/model_converter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo/export/trt_llm/converter/model_converter.py b/nemo/export/trt_llm/converter/model_converter.py index 2012f1b16983..337a0a4e4e77 100755 --- a/nemo/export/trt_llm/converter/model_converter.py +++ b/nemo/export/trt_llm/converter/model_converter.py @@ -38,6 +38,8 @@ def get_config(decoder_type, config): "llama": tensorrt_llm.models.llama.config.LLaMAConfig, "gpt": tensorrt_llm.models.gpt.config.GPTConfig, "gptnext": tensorrt_llm.models.gpt.config.GPTConfig, + "falcon": tensorrt_llm.models.falcon.config.FalconConfig, + "gemma": tensorrt_llm.models.GemmaConfig, } config_cls = DECODER_CONFIG[decoder_type] if decoder_type in DECODER_CONFIG else PretrainedConfig @@ -179,7 +181,6 @@ def model_to_trtllm_ckpt( 'tp_size': tensor_parallel_size, 'pp_size': pipeline_parallel_size, } - model_configs = [] weights_dicts = [] num_layers = nemo_model_config.get('num_layers') From 25b0e95436972e3042022384bd9b55ae5261adc8 Mon Sep 17 00:00:00 2001 From: Shanmugam Ramasamy Date: Mon, 16 Sep 2024 13:07:05 -0700 Subject: [PATCH 26/31] Small change --- nemo/export/tensorrt_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 3ae297e53995..645367397406 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -380,7 +380,7 @@ def get_model_config(nemo_model_config): trtllm_model_weights_list, trtllm_model_config_list = ( trtllm_helper.get_trtllm_pretrained_config_and_model_weights( - model_state_dict=model, export_config=export_config, dtype=input_dtype, num_process=1 + model_state_dict=model, export_config=export_config, dtype=input_dtype, num_process=1, state_dict_split_by_layer_numbers=False ) ) From f70c1da259b2a3e6041cff5f336540b456602e6b Mon Sep 17 00:00:00 2001 From: shanmugamr1992 Date: Mon, 16 Sep 2024 20:07:55 +0000 Subject: [PATCH 27/31] Apply isort and black reformatting Signed-off-by: shanmugamr1992 --- nemo/export/tensorrt_llm.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 645367397406..9c9879bcf110 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -380,7 +380,11 @@ def get_model_config(nemo_model_config): trtllm_model_weights_list, trtllm_model_config_list = ( trtllm_helper.get_trtllm_pretrained_config_and_model_weights( - model_state_dict=model, export_config=export_config, dtype=input_dtype, 
num_process=1, state_dict_split_by_layer_numbers=False + model_state_dict=model, + export_config=export_config, + dtype=input_dtype, + num_process=1, + state_dict_split_by_layer_numbers=False, ) ) From 822ec5b2de3e722ed23b46adbaf4838a15b9d20f Mon Sep 17 00:00:00 2001 From: shanmugamr1992 Date: Mon, 23 Sep 2024 18:02:05 +0000 Subject: [PATCH 28/31] Apply isort and black reformatting Signed-off-by: shanmugamr1992 --- nemo/export/tensorrt_llm.py | 62 ++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index e07a6b2c001a..5051155afed4 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -324,33 +324,33 @@ def export( if use_mcore_path: from megatron.core.export.data_type import DataType from megatron.core.export.export_config import ExportConfig - from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.export.model_type import ModelType from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import ( DEFAULT_CONVERSION_DICT, ) from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper + from megatron.core.transformer.transformer_config import TransformerConfig from tensorrt_llm.layers import MoeConfig def get_transformer_config(nemo_model_config): conf = TransformerConfig( - num_layers = nemo_model_config.get('num_layers'), - moe_router_topk = nemo_model_config.get('moe_router_topk', 0), - num_attention_heads = nemo_model_config.get('num_attention_heads'), - num_query_groups = nemo_model_config.get( - 'num_query_groups', nemo_model_config['num_attention_heads'] + num_layers=nemo_model_config.get('num_layers'), + moe_router_topk=nemo_model_config.get('moe_router_topk', 0), + num_attention_heads=nemo_model_config.get('num_attention_heads'), + num_query_groups=nemo_model_config.get( + 'num_query_groups', nemo_model_config['num_attention_heads'] ), - kv_channels = nemo_model_config.get("kv_channels", None), - hidden_size = nemo_model_config.get('hidden_size'), - ffn_hidden_size = nemo_model_config.get('ffn_hidden_size'), - layernorm_epsilon = nemo_model_config.get('layernorm_epsilon'), - add_bias_linear = nemo_model_config.get('bias'), - num_moe_experts = nemo_model_config.get('num_moe_experts', None), + kv_channels=nemo_model_config.get("kv_channels", None), + hidden_size=nemo_model_config.get('hidden_size'), + ffn_hidden_size=nemo_model_config.get('ffn_hidden_size'), + layernorm_epsilon=nemo_model_config.get('layernorm_epsilon'), + add_bias_linear=nemo_model_config.get('bias'), + num_moe_experts=nemo_model_config.get('num_moe_experts', None), ) return conf - # We build the transformer config using the nemo model config. + # We build the transformer config using the nemo model config. 
transformer_config = get_transformer_config(model_configs) input_model_type = getattr(ModelType, model_type) @@ -362,19 +362,23 @@ def get_transformer_config(nemo_model_config): } trtllm_helper = TRTLLMHelper( - transformer_config = transformer_config, - model_type = input_model_type, - trtllm_conversion_dict = nemo_model_conversion_dict, - position_embedding_type = model_configs.get('position_embedding_type'), - max_position_embeddings = model_configs.get('max_position_embeddings'), - rotary_percentage = model_configs.get('rotary_percentage', 1.0), - rotary_base = model_configs.get('rotary_base', 10000), - moe_tp_mode = model_configs.get('moe_tp_mode', 2), - multi_query_mode = model_configs.get("multi_query_mode", False), - activation = model_configs.get('activation', "gelu"), - seq_len_interpolation_factor = model_configs.get("seq_len_interpolation_factor"), - moe_renorm_mode = model_configs.get('moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE), - share_embeddings_and_output_weights = model_configs.get("share_embeddings_and_output_weights", False), + transformer_config=transformer_config, + model_type=input_model_type, + trtllm_conversion_dict=nemo_model_conversion_dict, + position_embedding_type=model_configs.get('position_embedding_type'), + max_position_embeddings=model_configs.get('max_position_embeddings'), + rotary_percentage=model_configs.get('rotary_percentage', 1.0), + rotary_base=model_configs.get('rotary_base', 10000), + moe_tp_mode=model_configs.get('moe_tp_mode', 2), + multi_query_mode=model_configs.get("multi_query_mode", False), + activation=model_configs.get('activation', "gelu"), + seq_len_interpolation_factor=model_configs.get("seq_len_interpolation_factor"), + moe_renorm_mode=model_configs.get( + 'moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE + ), + share_embeddings_and_output_weights=model_configs.get( + "share_embeddings_and_output_weights", False + ), ) input_dtype = getattr(DataType, dtype) @@ -428,7 +432,7 @@ def get_transformer_config(nemo_model_config): if model_type == "mixtral": model_type = "llama" - + weights_dicts, model_configs = model_to_trtllm_ckpt( model=model, nemo_model_config=model_configs, @@ -442,8 +446,8 @@ def get_transformer_config(nemo_model_config): use_embedding_sharing=use_embedding_sharing, fp8_quantized=fp8_quantized, fp8_kvcache=fp8_kvcache, - ) - + ) + for weight_dict, model_config in zip(weights_dicts, model_configs): build_and_save_engine( max_input_len=max_input_len, From e05fe2cd558872b696ecc5238afdbf1e3760c3e1 Mon Sep 17 00:00:00 2001 From: Shanmugam Ramasamy Date: Wed, 25 Sep 2024 10:53:01 -0700 Subject: [PATCH 29/31] ADD support for layernorm1p --- nemo/export/tensorrt_llm.py | 76 ++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 35 deletions(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index 5051155afed4..e539983282ed 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -324,33 +324,43 @@ def export( if use_mcore_path: from megatron.core.export.data_type import DataType from megatron.core.export.export_config import ExportConfig + from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.export.model_type import ModelType from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import ( DEFAULT_CONVERSION_DICT, ) from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper - from megatron.core.transformer.transformer_config import TransformerConfig 
from tensorrt_llm.layers import MoeConfig def get_transformer_config(nemo_model_config): + normalization = nemo_model_config.get('normalization', 'layernorm') + transformer_config_normalization = 'LayerNorm' + layernorm_zero_centered_gamma = False + if normalization == 'layernorm1p': + layernorm_zero_centered_gamma = True + elif normalization == 'rmsnorm': + transformer_config_normalization = 'RMSNorm' + conf = TransformerConfig( - num_layers=nemo_model_config.get('num_layers'), - moe_router_topk=nemo_model_config.get('moe_router_topk', 0), - num_attention_heads=nemo_model_config.get('num_attention_heads'), - num_query_groups=nemo_model_config.get( - 'num_query_groups', nemo_model_config['num_attention_heads'] + num_layers = nemo_model_config.get('num_layers'), + moe_router_topk = nemo_model_config.get('moe_router_topk', 0), + num_attention_heads = nemo_model_config.get('num_attention_heads'), + num_query_groups = nemo_model_config.get( + 'num_query_groups', nemo_model_config['num_attention_heads'] ), - kv_channels=nemo_model_config.get("kv_channels", None), - hidden_size=nemo_model_config.get('hidden_size'), - ffn_hidden_size=nemo_model_config.get('ffn_hidden_size'), - layernorm_epsilon=nemo_model_config.get('layernorm_epsilon'), - add_bias_linear=nemo_model_config.get('bias'), - num_moe_experts=nemo_model_config.get('num_moe_experts', None), + kv_channels = nemo_model_config.get("kv_channels", None), + hidden_size = nemo_model_config.get('hidden_size'), + ffn_hidden_size = nemo_model_config.get('ffn_hidden_size'), + layernorm_epsilon = nemo_model_config.get('layernorm_epsilon'), + add_bias_linear = nemo_model_config.get('bias'), + num_moe_experts = nemo_model_config.get('num_moe_experts', None), + normalization = transformer_config_normalization, + layernorm_zero_centered_gamma = layernorm_zero_centered_gamma ) return conf - # We build the transformer config using the nemo model config. + # We build the transformer config using the nemo model config. 
transformer_config = get_transformer_config(model_configs) input_model_type = getattr(ModelType, model_type) @@ -360,25 +370,21 @@ def get_transformer_config(nemo_model_config): nemo_model_conversion_dict = { f'model.{key}': value for key, value in mcore_model_conversion_dict.items() } - + trtllm_helper = TRTLLMHelper( - transformer_config=transformer_config, - model_type=input_model_type, - trtllm_conversion_dict=nemo_model_conversion_dict, - position_embedding_type=model_configs.get('position_embedding_type'), - max_position_embeddings=model_configs.get('max_position_embeddings'), - rotary_percentage=model_configs.get('rotary_percentage', 1.0), - rotary_base=model_configs.get('rotary_base', 10000), - moe_tp_mode=model_configs.get('moe_tp_mode', 2), - multi_query_mode=model_configs.get("multi_query_mode", False), - activation=model_configs.get('activation', "gelu"), - seq_len_interpolation_factor=model_configs.get("seq_len_interpolation_factor"), - moe_renorm_mode=model_configs.get( - 'moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE - ), - share_embeddings_and_output_weights=model_configs.get( - "share_embeddings_and_output_weights", False - ), + transformer_config = transformer_config, + model_type = input_model_type, + trtllm_conversion_dict = nemo_model_conversion_dict, + position_embedding_type = model_configs.get('position_embedding_type'), + max_position_embeddings = model_configs.get('max_position_embeddings'), + rotary_percentage = model_configs.get('rotary_percentage', 1.0), + rotary_base = model_configs.get('rotary_base', 10000), + moe_tp_mode = model_configs.get('moe_tp_mode', 2), + multi_query_mode = model_configs.get("multi_query_mode", False), + activation = model_configs.get('activation', "gelu"), + seq_len_interpolation_factor = model_configs.get("seq_len_interpolation_factor"), + moe_renorm_mode = model_configs.get('moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE), + share_embeddings_and_output_weights = model_configs.get("share_embeddings_and_output_weights", False), ) input_dtype = getattr(DataType, dtype) @@ -388,7 +394,7 @@ def get_transformer_config(nemo_model_config): use_parallel_embedding, use_embedding_sharing, ) - + trtllm_model_weights_list, trtllm_model_config_list = ( trtllm_helper.get_trtllm_pretrained_config_and_model_weights( model_state_dict=model, @@ -432,7 +438,7 @@ def get_transformer_config(nemo_model_config): if model_type == "mixtral": model_type = "llama" - + weights_dicts, model_configs = model_to_trtllm_ckpt( model=model, nemo_model_config=model_configs, @@ -446,8 +452,8 @@ def get_transformer_config(nemo_model_config): use_embedding_sharing=use_embedding_sharing, fp8_quantized=fp8_quantized, fp8_kvcache=fp8_kvcache, - ) - + ) + for weight_dict, model_config in zip(weights_dicts, model_configs): build_and_save_engine( max_input_len=max_input_len, From 28a0eb5dbb82e02abca0ee2e30c14892a13f15fb Mon Sep 17 00:00:00 2001 From: shanmugamr1992 Date: Wed, 25 Sep 2024 17:56:53 +0000 Subject: [PATCH 30/31] Apply isort and black reformatting Signed-off-by: shanmugamr1992 --- nemo/export/tensorrt_llm.py | 74 +++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index e539983282ed..34f17793944a 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -324,12 +324,12 @@ def export( if use_mcore_path: from megatron.core.export.data_type import DataType from megatron.core.export.export_config 
import ExportConfig - from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.export.model_type import ModelType from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import ( DEFAULT_CONVERSION_DICT, ) from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper + from megatron.core.transformer.transformer_config import TransformerConfig from tensorrt_llm.layers import MoeConfig def get_transformer_config(nemo_model_config): @@ -337,30 +337,30 @@ def get_transformer_config(nemo_model_config): transformer_config_normalization = 'LayerNorm' layernorm_zero_centered_gamma = False if normalization == 'layernorm1p': - layernorm_zero_centered_gamma = True + layernorm_zero_centered_gamma = True elif normalization == 'rmsnorm': transformer_config_normalization = 'RMSNorm' - + conf = TransformerConfig( - num_layers = nemo_model_config.get('num_layers'), - moe_router_topk = nemo_model_config.get('moe_router_topk', 0), - num_attention_heads = nemo_model_config.get('num_attention_heads'), - num_query_groups = nemo_model_config.get( - 'num_query_groups', nemo_model_config['num_attention_heads'] + num_layers=nemo_model_config.get('num_layers'), + moe_router_topk=nemo_model_config.get('moe_router_topk', 0), + num_attention_heads=nemo_model_config.get('num_attention_heads'), + num_query_groups=nemo_model_config.get( + 'num_query_groups', nemo_model_config['num_attention_heads'] ), - kv_channels = nemo_model_config.get("kv_channels", None), - hidden_size = nemo_model_config.get('hidden_size'), - ffn_hidden_size = nemo_model_config.get('ffn_hidden_size'), - layernorm_epsilon = nemo_model_config.get('layernorm_epsilon'), - add_bias_linear = nemo_model_config.get('bias'), - num_moe_experts = nemo_model_config.get('num_moe_experts', None), - normalization = transformer_config_normalization, - layernorm_zero_centered_gamma = layernorm_zero_centered_gamma + kv_channels=nemo_model_config.get("kv_channels", None), + hidden_size=nemo_model_config.get('hidden_size'), + ffn_hidden_size=nemo_model_config.get('ffn_hidden_size'), + layernorm_epsilon=nemo_model_config.get('layernorm_epsilon'), + add_bias_linear=nemo_model_config.get('bias'), + num_moe_experts=nemo_model_config.get('num_moe_experts', None), + normalization=transformer_config_normalization, + layernorm_zero_centered_gamma=layernorm_zero_centered_gamma, ) return conf - # We build the transformer config using the nemo model config. + # We build the transformer config using the nemo model config. 
transformer_config = get_transformer_config(model_configs) input_model_type = getattr(ModelType, model_type) @@ -370,21 +370,25 @@ def get_transformer_config(nemo_model_config): nemo_model_conversion_dict = { f'model.{key}': value for key, value in mcore_model_conversion_dict.items() } - + trtllm_helper = TRTLLMHelper( - transformer_config = transformer_config, - model_type = input_model_type, - trtllm_conversion_dict = nemo_model_conversion_dict, - position_embedding_type = model_configs.get('position_embedding_type'), - max_position_embeddings = model_configs.get('max_position_embeddings'), - rotary_percentage = model_configs.get('rotary_percentage', 1.0), - rotary_base = model_configs.get('rotary_base', 10000), - moe_tp_mode = model_configs.get('moe_tp_mode', 2), - multi_query_mode = model_configs.get("multi_query_mode", False), - activation = model_configs.get('activation', "gelu"), - seq_len_interpolation_factor = model_configs.get("seq_len_interpolation_factor"), - moe_renorm_mode = model_configs.get('moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE), - share_embeddings_and_output_weights = model_configs.get("share_embeddings_and_output_weights", False), + transformer_config=transformer_config, + model_type=input_model_type, + trtllm_conversion_dict=nemo_model_conversion_dict, + position_embedding_type=model_configs.get('position_embedding_type'), + max_position_embeddings=model_configs.get('max_position_embeddings'), + rotary_percentage=model_configs.get('rotary_percentage', 1.0), + rotary_base=model_configs.get('rotary_base', 10000), + moe_tp_mode=model_configs.get('moe_tp_mode', 2), + multi_query_mode=model_configs.get("multi_query_mode", False), + activation=model_configs.get('activation', "gelu"), + seq_len_interpolation_factor=model_configs.get("seq_len_interpolation_factor"), + moe_renorm_mode=model_configs.get( + 'moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE + ), + share_embeddings_and_output_weights=model_configs.get( + "share_embeddings_and_output_weights", False + ), ) input_dtype = getattr(DataType, dtype) @@ -394,7 +398,7 @@ def get_transformer_config(nemo_model_config): use_parallel_embedding, use_embedding_sharing, ) - + trtllm_model_weights_list, trtllm_model_config_list = ( trtllm_helper.get_trtllm_pretrained_config_and_model_weights( model_state_dict=model, @@ -438,7 +442,7 @@ def get_transformer_config(nemo_model_config): if model_type == "mixtral": model_type = "llama" - + weights_dicts, model_configs = model_to_trtllm_ckpt( model=model, nemo_model_config=model_configs, @@ -452,8 +456,8 @@ def get_transformer_config(nemo_model_config): use_embedding_sharing=use_embedding_sharing, fp8_quantized=fp8_quantized, fp8_kvcache=fp8_kvcache, - ) - + ) + for weight_dict, model_config in zip(weights_dicts, model_configs): build_and_save_engine( max_input_len=max_input_len, From 68c635e3e58e278d904bb29a958af34a9fbf5281 Mon Sep 17 00:00:00 2001 From: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> Date: Fri, 27 Sep 2024 10:45:00 -0700 Subject: [PATCH 31/31] Update Dockerfile.ci Signed-off-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> --- Dockerfile.ci | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.ci b/Dockerfile.ci index 59889f3ffd04..85da33c69617 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -58,7 +58,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T # Install NeMo requirements ARG 
TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.17.0 -ARG MCORE_TAG=8e69382660831b713074aa9b50356c0f23c84c92 +ARG MCORE_TAG=b92645d3ca3bb14aa124ff8a3d54c6f68538edad ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \
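
For reference, below is a minimal, illustrative sketch of how the `use_mcore_path` flag introduced in this series might be exercised from the exporter's `export()` API. The `TensorRTLLM` class name, its import path, and all file paths/values are assumptions for demonstration only; the keyword arguments mirror those shown in the diffs above.

# Illustrative sketch only -- not part of the patch series. The exporter class name,
# import path, and paths are assumptions; the keyword arguments mirror the export()
# signature modified in these patches.
from nemo.export.tensorrt_llm import TensorRTLLM

exporter = TensorRTLLM(model_dir="/tmp/trtllm_engine")  # directory where the built TRT-LLM engine is written

exporter.export(
    nemo_checkpoint_path="/path/to/model.nemo",  # NeMo checkpoint to convert
    model_type="llama",                          # decoder type resolved via ModelType in the mcore path
    tensor_parallelism_size=1,
    pipeline_parallelism_size=1,
    use_mcore_path=True,  # route the export through the Megatron-Core TRTLLMHelper path
)

With `use_mcore_path=False` (the default after PATCH 23), the same call falls back to the pre-existing `model_to_trtllm_ckpt` conversion path.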