diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml index 3e2d63285ec4..bd80e88ee964 100644 --- a/.github/workflows/_test_template.yml +++ b/.github/workflows/_test_template.yml @@ -59,7 +59,7 @@ jobs: ( set -e - docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '${{ inputs.SCRIPT }}' + docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '${{ inputs.SCRIPT }}' ) 2> >(tee err.log) EXIT_CODE=$? @@ -73,4 +73,4 @@ jobs: - name: after_script if: always() && inputs.AFTER_SCRIPT != ':' run: | - docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '${{ inputs.AFTER_SCRIPT }}' \ No newline at end of file + docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '${{ inputs.AFTER_SCRIPT }}' \ No newline at end of file diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index 022399ec4e5b..1970c26fa41c 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -92,7 +92,7 @@ jobs: -H "Authorization: Bearer $GH_TOKEN" \ -H "X-GitHub-Api-Version: 2022-11-28" \ https://api.github.com/repos/NVIDIA/NeMo/pulls \ - -d $PAYLOAD) + -d "$PAYLOAD") NEW_PR_ID=$(echo -E $NEW_PR | jq '.number') curl -L \ @@ -120,7 +120,7 @@ jobs: "type": "section", "text": { "type": "mrkdwn", - "text": ":alert: Cherrypick bot 🤖: Hey @'$USERNAME': Cherry-pick of <'$URL'|#'$PR_ID'> failed (3-way merge impossible). Please resolve manually and create a PR.\n\ncc: @okoenig" + "text": ":alert: Cherrypick bot 🤖: Hey <@'$USERNAME'>: Cherry-pick of <'$URL'|#'$PR_ID'> failed (3-way merge impossible). 
Please resolve manually and create a PR.\n\ncc: <@${{ secrets.SLACK_WEBHOOK_ADMIN }}>" } } ] diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 74ae8d57b738..20c0df66c005 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -5223,6 +5223,36 @@ jobs: AFTER_SCRIPT: | rm -rf /home/TestData/nlp/megatron_mamba/nemo-ux-mamba/cicd_test_sft/${{ github.run_id }} + L2_NeMo_2_HF_MODEL_IMPORT: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + with: + RUNNER: self-hosted-azure + SCRIPT: | + + python tests/collections/llm/gpt/model/test_model_import.py + + AFTER_SCRIPT: | + rm -rf ~/.cache/nemo/models + + L2_NeMo_2_T5_Pretraining: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_T5_Pretraining') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_pretraining.py \ + --devices=2 \ + --max-steps=3 \ + --experiment-dir=tests/collections/llm/t5_pretrain_results \ + --data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document \ + --index-mapping-dir=tests/collections/llm/t5_index_mappings + + AFTER_SCRIPT: | + rm -rf tests/collections/llm/t5_pretrain_results + rm -rf tests/collections/llm/t5_index_mappings + Nemo_CICD_Test: needs: - pre-flight @@ -5357,8 +5387,10 @@ jobs: #- OPTIONAL_L2_Stable_Diffusion_Training - L2_NeMo_2_GPT_Pretraining_no_transformer_engine - L2_NeMo_2_GPT_DDP_Param_Parity_check + - L2_NeMo_2_HF_MODEL_IMPORT - L2_NeMo_2_SSM_Pretraining - L2_NeMo_2_SSM_Finetuning + - L2_NeMo_2_T5_Pretraining if: always() runs-on: ubuntu-latest steps: @@ -5377,6 +5409,23 @@ jobs: - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'true' }} run: exit 0 + - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'true' && github.event_name == 'pull_request' }} + uses: peter-evans/create-or-update-comment@v4 + with: + issue-number: ${{ github.event.number }} + body: | + [🤖]: Hi @${{ github.event.pull_request.user.login }} 👋, + + I just wanted to let you know that, you know, a [CICD pipeline](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) for this PR just finished successfully ✨ + + So it might be time to merge this PR or like to get some approvals 🚀 + + But I'm just a 🤖 so I'll leave it you what to do next. + + Have a great day! + + //cc @ko3n1g + - if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' }} name: Checkout repository uses: actions/checkout@v4 @@ -5452,4 +5501,3 @@ jobs: - if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'false' }} run: | exit 1 - diff --git a/README.md b/README.md index 9b019d3ac175..f229cfb637ea 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,16 @@
Speech Recognition
+
+    Accelerating Leaderboard-Topping ASR Models 10x with NVIDIA NeMo (2024/09/24)
+
+    NVIDIA NeMo team released a number of inference optimizations for CTC, RNN-T, and TDT models that resulted in up to 10x inference speed-up.
+    These models now exceed an inverse real-time factor (RTFx) of 2,000, with some reaching RTFx of even 6,000.
+
diff --git a/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py b/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py index 476d8ff70786..55852ee3ba8f 100644 --- a/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py +++ b/examples/asr/asr_cache_aware_streaming/speech_to_text_cache_aware_streaming_infer.py @@ -127,7 +127,7 @@ def perform_streaming( # would pass the whole audio at once through the model like offline mode in order to compare the results with the stremaing mode # the output of the model in the offline and streaming mode should be exactly the same with torch.inference_mode(): - with autocast(): + with autocast: processed_signal, processed_signal_length = streaming_buffer.get_all_audios() with torch.no_grad(): ( @@ -156,7 +156,7 @@ def perform_streaming( pred_out_stream = None for step_num, (chunk_audio, chunk_lengths) in enumerate(streaming_buffer_iter): with torch.inference_mode(): - with autocast(): + with autocast: # keep_all_outputs needs to be True for the last step of streaming when model is trained with att_context_style=regular # otherwise the last outputs would get dropped @@ -313,19 +313,7 @@ def main(): raise ValueError("Model does not support multiple lookaheads.") global autocast - if ( - args.use_amp - and torch.cuda.is_available() - and hasattr(torch.cuda, 'amp') - and hasattr(torch.cuda.amp, 'autocast') - ): - logging.info("AMP enabled!\n") - autocast = torch.cuda.amp.autocast - else: - - @contextlib.contextmanager - def autocast(): - yield + autocast = torch.amp.autocast(asr_model.device.type, enabled=args.use_amp) # configure the decoding config decoding_cfg = asr_model.cfg.decoding diff --git a/examples/asr/asr_chunked_inference/aed/speech_to_text_aed_chunked_infer.py b/examples/asr/asr_chunked_inference/aed/speech_to_text_aed_chunked_infer.py index 39b7547923cd..0195c1edd239 100644 --- a/examples/asr/asr_chunked_inference/aed/speech_to_text_aed_chunked_infer.py +++ b/examples/asr/asr_chunked_inference/aed/speech_to_text_aed_chunked_infer.py @@ -170,16 +170,6 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig: # Disable config overwriting OmegaConf.set_struct(model_cfg.preprocessor, True) - # setup AMP (optional) - if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'): - logging.info("AMP enabled!\n") - autocast = torch.cuda.amp.autocast - else: - - @contextlib.contextmanager - def autocast(*args, **kwargs): - yield - # Compute output filename cfg = compute_output_filename(cfg, model_name) @@ -208,7 +198,7 @@ def autocast(*args, **kwargs): amp_dtype = torch.float16 if cfg.amp_dtype == "float16" else torch.bfloat16 - with autocast(dtype=amp_dtype): + with torch.amp.autocast(asr_model.device.type, enabled=cfg.amp, dtype=amp_dtype): with torch.no_grad(): hyps = get_buffered_pred_feat_multitaskAED( frame_asr, diff --git a/examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py b/examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py index 1d01e8e6a7a1..3feef6a027b8 100644 --- a/examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py +++ b/examples/asr/asr_chunked_inference/ctc/speech_to_text_buffered_infer_ctc.py @@ -88,7 +88,9 @@ class TranscriptionConfig: # Chunked configs chunk_len_in_secs: float = 1.6 # Chunk length in seconds total_buffer_in_secs: float = 4.0 # Length of buffer (chunk + left and right padding) in seconds - 
model_stride: int = 8 # Model downsampling factor, 8 for Citrinet and FasConformer models and 4 for Conformer models. + model_stride: int = ( + 8 # Model downsampling factor, 8 for Citrinet and FasConformer models and 4 for Conformer models. + ) # Decoding strategy for CTC models decoding: CTCDecodingConfig = CTCDecodingConfig() @@ -163,16 +165,6 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig: # Disable config overwriting OmegaConf.set_struct(model_cfg.preprocessor, True) - # setup AMP (optional) - if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'): - logging.info("AMP enabled!\n") - autocast = torch.cuda.amp.autocast - else: - - @contextlib.contextmanager - def autocast(): - yield - # Compute output filename cfg = compute_output_filename(cfg, model_name) @@ -214,20 +206,24 @@ def autocast(): logging.info(f"tokens_per_chunk is {tokens_per_chunk}, mid_delay is {mid_delay}") frame_asr = FrameBatchASR( - asr_model=asr_model, frame_len=chunk_len, total_buffer=cfg.total_buffer_in_secs, batch_size=cfg.batch_size, + asr_model=asr_model, + frame_len=chunk_len, + total_buffer=cfg.total_buffer_in_secs, + batch_size=cfg.batch_size, ) - hyps = get_buffered_pred_feat( - frame_asr, - chunk_len, - tokens_per_chunk, - mid_delay, - model_cfg.preprocessor, - model_stride_in_secs, - asr_model.device, - manifest, - filepaths, - ) + with torch.amp.autocast(asr_model.device.type, enabled=cfg.amp): + hyps = get_buffered_pred_feat( + frame_asr, + chunk_len, + tokens_per_chunk, + mid_delay, + model_cfg.preprocessor, + model_stride_in_secs, + asr_model.device, + manifest, + filepaths, + ) output_filename, pred_text_attr_name = write_transcription( hyps, cfg, model_name, filepaths=filepaths, compute_langs=False, compute_timestamps=False ) diff --git a/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py b/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py index ea82796eab39..2014d8782bca 100644 --- a/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py +++ b/examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py @@ -84,8 +84,6 @@ from nemo.core.config import hydra_runner from nemo.utils import logging -can_gpu = torch.cuda.is_available() - @dataclass class TranscriptionConfig: @@ -112,7 +110,9 @@ class TranscriptionConfig: # Chunked configs chunk_len_in_secs: float = 1.6 # Chunk length in seconds total_buffer_in_secs: float = 4.0 # Length of buffer (chunk + left and right padding) in seconds - model_stride: int = 8 # Model downsampling factor, 8 for Citrinet and FastConformer models and 4 for Conformer models. + model_stride: int = ( + 8 # Model downsampling factor, 8 for Citrinet and FastConformer models and 4 for Conformer models. + ) # Set `cuda` to int to define CUDA device. If 'None', will look for CUDA # device anyway, and do inference on CPU only if CUDA device is not found. 
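The recurring change in the ASR example scripts above (and in the files that follow) replaces the deprecated, CUDA-only `torch.cuda.amp.autocast` and the no-op fallback context managers with the device-agnostic `torch.amp.autocast`, keyed off the model's device. A minimal sketch of the new pattern follows, assuming a toy `torch.nn.Linear` model and a `use_amp` flag standing in for the scripts' `cfg.amp` / `args.amp` options:

```python
import torch

# Hypothetical stand-ins for the script's ASR model and its AMP flag;
# in the scripts above these are `asr_model` and `cfg.amp` / `args.amp`.
model = torch.nn.Linear(16, 16)
use_amp = torch.cuda.is_available()

# Old pattern (deprecated, CUDA-only):
#     with torch.cuda.amp.autocast(enabled=use_amp):
#         ...
# New pattern: derive the device type from the model so the same code path
# works on CUDA and degrades gracefully on CPU.
device_type = next(model.parameters()).device.type
with torch.amp.autocast(device_type, enabled=use_amp):
    with torch.no_grad():
        out = model(torch.randn(4, 16))
print(out.dtype, out.shape)
```

This is why the scripts can drop the `torch.cuda.is_available()` / `hasattr(torch.cuda, 'amp')` guards: `torch.amp.autocast` takes the device type directly (`asr_model.device.type`, `vad_model.device.type`, and so on), and `enabled=False` simply makes the context a no-op.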
@@ -274,6 +274,7 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig: batch_size=cfg.batch_size, manifest=manifest, filepaths=filepaths, + accelerator=accelerator, ) output_filename, pred_text_attr_name = write_transcription( diff --git a/examples/asr/asr_vad/speech_to_text_with_vad.py b/examples/asr/asr_vad/speech_to_text_with_vad.py index 391f299aa441..27ca7bc1f84c 100644 --- a/examples/asr/asr_vad/speech_to_text_with_vad.py +++ b/examples/asr/asr_vad/speech_to_text_with_vad.py @@ -63,6 +63,7 @@ from typing import Callable, Optional import torch +import torch.amp import yaml from omegaconf import DictConfig, OmegaConf from torch.profiler import ProfilerActivity, profile, record_function @@ -84,14 +85,6 @@ from nemo.core.config import hydra_runner from nemo.utils import logging -try: - from torch.cuda.amp import autocast -except ImportError: - - @contextlib.contextmanager - def autocast(enabled=None): - yield - @dataclass class InferenceConfig: @@ -105,9 +98,9 @@ class InferenceConfig: use_rttm: bool = True # whether to use RTTM rttm_mode: str = "mask" # how to use RTTM files, choices=[`mask`, `drop`] feat_mask_val: Optional[float] = None # value used to mask features based on RTTM, set None to use defaults - normalize: Optional[ - str - ] = "post_norm" # whether and where to normalize audio feature, choices=[None, `pre_norm`, `post_norm`] + normalize: Optional[str] = ( + "post_norm" # whether and where to normalize audio feature, choices=[None, `pre_norm`, `post_norm`] + ) normalize_type: str = "per_feature" # how to determine mean and std used for normalization normalize_audio_db: Optional[float] = None # set to normalize RMS DB of audio before extracting audio features @@ -117,7 +110,9 @@ class InferenceConfig: batch_size: int = 1 # batch size for ASR. Feature extraction and VAD only support single sample per batch. num_workers: int = 8 sample_rate: int = 16000 - frame_unit_time_secs: float = 0.01 # unit time per frame in seconds, equal to `window_stride` in ASR configs, typically 10ms. + frame_unit_time_secs: float = ( + 0.01 # unit time per frame in seconds, equal to `window_stride` in ASR configs, typically 10ms. 
+ ) audio_type: str = "wav" # Output settings, no need to change @@ -263,7 +258,9 @@ def extract_audio_features(manifest_filepath: str, cfg: DictConfig, record_fn: C 'vad_stream': False, 'sample_rate': cfg.sample_rate, 'manifest_filepath': manifest_filepath, - 'labels': ['infer',], + 'labels': [ + 'infer', + ], 'num_workers': cfg.num_workers, 'shuffle': False, 'normalize_audio_db': cfg.normalize_audio_db, @@ -274,10 +271,11 @@ def extract_audio_features(manifest_filepath: str, cfg: DictConfig, record_fn: C with record_fn("feat_extract_loop"): for i, test_batch in enumerate(tqdm(vad_model.test_dataloader(), total=len(vad_model.test_dataloader()))): test_batch = [x.to(vad_model.device) for x in test_batch] - with autocast(): + with torch.amp.autocast(vad_model.device.type): with record_fn("feat_extract_infer"): processed_signal, processed_signal_length = vad_model.preprocessor( - input_signal=test_batch[0], length=test_batch[1], + input_signal=test_batch[0], + length=test_batch[1], ) with record_fn("feat_extract_other"): processed_signal = processed_signal.squeeze(0)[:, :processed_signal_length] @@ -317,7 +315,9 @@ def run_vad_inference(manifest_filepath: str, cfg: DictConfig, record_fn: Callab test_data_config = { 'vad_stream': True, 'manifest_filepath': manifest_filepath, - 'labels': ['infer',], + 'labels': [ + 'infer', + ], 'num_workers': cfg.num_workers, 'shuffle': False, 'window_length_in_sec': vad_cfg.vad.parameters.window_length_in_sec, @@ -438,7 +438,7 @@ def generate_vad_frame_pred( with record_fn("vad_infer_loop"): for i, test_batch in enumerate(tqdm(vad_model.test_dataloader(), total=len(vad_model.test_dataloader()))): test_batch = [x.to(vad_model.device) for x in test_batch] - with autocast(): + with torch.amp.autocast(vad_model.device.type): with record_fn("vad_infer_model"): if use_feat: log_probs = vad_model(processed_signal=test_batch[0], processed_signal_length=test_batch[1]) @@ -572,7 +572,7 @@ def run_asr_inference(manifest_filepath, cfg, record_fn) -> str: hypotheses = [] all_hypotheses = [] t0 = time.time() - with autocast(): + with torch.amp.autocast(asr_model.device.type): with torch.no_grad(): with record_fn("asr_infer_loop"): for test_batch in tqdm(dataloader, desc="Transcribing"): @@ -585,7 +585,11 @@ def run_asr_inference(manifest_filepath, cfg, record_fn) -> str: with record_fn("asr_infer_other"): logits, logits_len = outputs[0], outputs[1] - current_hypotheses, all_hyp = decode_function(logits, logits_len, return_hypotheses=False,) + current_hypotheses, all_hyp = decode_function( + logits, + logits_len, + return_hypotheses=False, + ) if isinstance(current_hypotheses, tuple) and len(current_hypotheses) == 2: current_hypotheses = current_hypotheses[0] # handle RNNT output diff --git a/examples/asr/experimental/sclite/speech_to_text_sclite.py b/examples/asr/experimental/sclite/speech_to_text_sclite.py index 80a47585e000..ffbf629b3ed3 100644 --- a/examples/asr/experimental/sclite/speech_to_text_sclite.py +++ b/examples/asr/experimental/sclite/speech_to_text_sclite.py @@ -42,15 +42,6 @@ from nemo.collections.asr.parts.utils.manifest_utils import read_manifest from nemo.utils import logging -try: - from torch.cuda.amp import autocast -except ImportError: - from contextlib import contextmanager - - @contextmanager - def autocast(enabled=None): - yield - def score_with_sctk(sctk_dir, ref_fname, hyp_fname, out_dir, glm=""): sclite_path = os.path.join(sctk_dir, "bin", "sclite") @@ -91,7 +82,11 @@ def get_utt_info(manifest_path): def main(): parser = ArgumentParser() 
parser.add_argument( - "--asr_model", type=str, default="QuartzNet15x5Base-En", required=False, help="Pass: 'QuartzNet15x5Base-En'", + "--asr_model", + type=str, + default="QuartzNet15x5Base-En", + required=False, + help="Pass: 'QuartzNet15x5Base-En'", ) parser.add_argument("--dataset", type=str, required=True, help="path to evaluation data") parser.add_argument("--batch_size", type=int, default=4) @@ -123,7 +118,7 @@ def main(): references = [data['text'] for data in manifest_data] audio_filepaths = [data['audio_filepath'] for data in manifest_data] - with autocast(): + with torch.amp.autocast(asr_model.device.type): hypotheses = asr_model.transcribe(audio_filepaths, batch_size=args.batch_size) # if transcriptions form a tuple (from RNNT), extract just "best" hypothesis diff --git a/examples/asr/quantization/speech_to_text_calibrate.py b/examples/asr/quantization/speech_to_text_calibrate.py index 264806c7b1ba..f5ec6e76fa27 100644 --- a/examples/asr/quantization/speech_to_text_calibrate.py +++ b/examples/asr/quantization/speech_to_text_calibrate.py @@ -35,23 +35,17 @@ "https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization." ) -try: - from torch.cuda.amp import autocast -except ImportError: - from contextlib import contextmanager - - @contextmanager - def autocast(enabled=None): - yield - - can_gpu = torch.cuda.is_available() def main(): parser = ArgumentParser() parser.add_argument( - "--asr_model", type=str, default="QuartzNet15x5Base-En", required=True, help="Pass: 'QuartzNet15x5Base-En'", + "--asr_model", + type=str, + default="QuartzNet15x5Base-En", + required=True, + help="Pass: 'QuartzNet15x5Base-En'", ) parser.add_argument("--dataset", type=str, required=True, help="path to evaluation data") parser.add_argument("--batch_size", type=int, default=256) @@ -118,11 +112,8 @@ def main(): for i, test_batch in enumerate(asr_model.test_dataloader()): if can_gpu: test_batch = [x.cuda() for x in test_batch] - if args.amp: - with autocast(): + with torch.amp.autocast(asr_model.device.type, enabled=args.amp): _ = asr_model(input_signal=test_batch[0], input_signal_length=test_batch[1]) - else: - _ = asr_model(input_signal=test_batch[0], input_signal_length=test_batch[1]) if i >= args.num_calib_batch: break diff --git a/examples/asr/quantization/speech_to_text_quant_infer.py b/examples/asr/quantization/speech_to_text_quant_infer.py index 029623cb90f0..b428db1ed83d 100644 --- a/examples/asr/quantization/speech_to_text_quant_infer.py +++ b/examples/asr/quantization/speech_to_text_quant_infer.py @@ -38,23 +38,17 @@ ) -try: - from torch.cuda.amp import autocast -except ImportError: - from contextlib import contextmanager - - @contextmanager - def autocast(enabled=None): - yield - - can_gpu = torch.cuda.is_available() def main(): parser = ArgumentParser() parser.add_argument( - "--asr_model", type=str, default="QuartzNet15x5Base-En", required=True, help="Pass: 'QuartzNet15x5Base-En'", + "--asr_model", + type=str, + default="QuartzNet15x5Base-En", + required=True, + help="Pass: 'QuartzNet15x5Base-En'", ) parser.add_argument("--dataset", type=str, required=True, help="path to evaluation data") parser.add_argument("--wer_target", type=float, default=None, help="used by test") @@ -199,7 +193,7 @@ def evaluate(asr_model, labels_map, wer): for test_batch in asr_model.test_dataloader(): if can_gpu: test_batch = [x.cuda() for x in test_batch] - with autocast(): + with torch.amp.autocast(asr_model.device.type): log_probs, encoded_len, greedy_predictions = asr_model( 
input_signal=test_batch[0], input_signal_length=test_batch[1] ) diff --git a/examples/asr/quantization/speech_to_text_quant_infer_trt.py b/examples/asr/quantization/speech_to_text_quant_infer_trt.py index e9916d6e7449..3fb982002c0c 100644 --- a/examples/asr/quantization/speech_to_text_quant_infer_trt.py +++ b/examples/asr/quantization/speech_to_text_quant_infer_trt.py @@ -43,20 +43,15 @@ can_gpu = torch.cuda.is_available() -try: - from torch.cuda.amp import autocast -except ImportError: - from contextlib import contextmanager - - @contextmanager - def autocast(enabled=None): - yield - def main(): parser = ArgumentParser() parser.add_argument( - "--asr_model", type=str, default="QuartzNet15x5Base-En", required=True, help="Pass: 'QuartzNet15x5Base-En'", + "--asr_model", + type=str, + default="QuartzNet15x5Base-En", + required=True, + help="Pass: 'QuartzNet15x5Base-En'", ) parser.add_argument( "--asr_onnx", @@ -145,9 +140,11 @@ def build_trt_engine(asr_model, onnx_path, qat): network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) if qat: network_flags |= 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_PRECISION) - with builder.create_network(flags=network_flags) as network, trt.OnnxParser( - network, TRT_LOGGER - ) as parser, builder.create_builder_config() as builder_config: + with ( + builder.create_network(flags=network_flags) as network, + trt.OnnxParser(network, TRT_LOGGER) as parser, + builder.create_builder_config() as builder_config, + ): parser.parse_from_file(onnx_path) builder_config.max_workspace_size = workspace_size * (1024 * 1024) if qat: diff --git a/examples/asr/speech_translation/translate_speech.py b/examples/asr/speech_translation/translate_speech.py index 203852b52ee9..42394001255f 100644 --- a/examples/asr/speech_translation/translate_speech.py +++ b/examples/asr/speech_translation/translate_speech.py @@ -162,16 +162,6 @@ def main(cfg: TranslationConfig) -> Union[TranslationConfig, List[str]]: # prepare audio filepaths and decide wether it's partial audio filepaths, partial_audio = prepare_audio_data(cfg) - # setup AMP (optional) - if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'): - logging.info("AMP enabled!\n") - autocast = torch.cuda.amp.autocast - else: - - @contextlib.contextmanager - def autocast(): - yield - # Compute output filename cfg = compute_output_filename(cfg, model_name) @@ -184,10 +174,12 @@ def autocast(): return cfg # translate audio - with autocast(): + with torch.amp.autocast(asr_model.device.type, enabled=cfg.amp): with torch.no_grad(): translations = asr_model.translate( - paths2audio_files=filepaths, batch_size=cfg.batch_size, return_hypotheses=return_hypotheses, + paths2audio_files=filepaths, + batch_size=cfg.batch_size, + return_hypotheses=return_hypotheses, ) logging.info(f"Finished translating {len(filepaths)} files !") diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py index a8df6bc5a911..f3a1c3fc8162 100644 --- a/examples/asr/transcribe_speech.py +++ b/examples/asr/transcribe_speech.py @@ -358,16 +358,6 @@ def main(cfg: TranscriptionConfig) -> Union[TranscriptionConfig, List[Hypothesis filepaths = sorted_manifest_path if sorted_manifest_path is not None else filepaths - # setup AMP (optional) - if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'): - logging.info("AMP enabled!\n") - autocast = torch.cuda.amp.autocast - else: - - @contextlib.contextmanager - def 
autocast(dtype=None, enabled=True): - yield - # Compute output filename cfg = compute_output_filename(cfg, model_name) @@ -393,7 +383,7 @@ def autocast(dtype=None, enabled=True): ) total_duration += item["duration"] - with autocast(dtype=amp_dtype, enabled=cfg.amp): + with torch.amp.autocast('cuda' if torch.cuda.is_available() else 'cpu', dtype=amp_dtype, enabled=cfg.amp): with torch.no_grad(): if cfg.calculate_rtfx: start_time = time.time() diff --git a/examples/multimodal/multimodal_llm/neva/conf/neva_export.yaml b/examples/multimodal/multimodal_llm/neva/conf/neva_export.yaml index 1ab9bdbd6398..23b94332d230 100644 --- a/examples/multimodal/multimodal_llm/neva/conf/neva_export.yaml +++ b/examples/multimodal/multimodal_llm/neva/conf/neva_export.yaml @@ -12,5 +12,6 @@ model: type: neva #neva, video-neva, lita, vila, vita precision: bfloat16 visual_model_path: /path/to/visual.nemo - llm_model_path: /path/to/llm.nemo - llm_model_type: llama + llm_model_path: null + llm_model_type: llama + lora_path: null diff --git a/examples/multimodal/multimodal_llm/neva/conf/neva_trt_infer.yaml b/examples/multimodal/multimodal_llm/neva/conf/neva_trt_infer.yaml index 14e6f98c0676..6ddc73875f4a 100644 --- a/examples/multimodal/multimodal_llm/neva/conf/neva_trt_infer.yaml +++ b/examples/multimodal/multimodal_llm/neva/conf/neva_trt_infer.yaml @@ -10,3 +10,4 @@ infer: repetition_penalty: 1.0 # The parameter for repetition penalty. 1.0 means no penalty. num_beams: 1 max_new_tokens: 30 + lora_uids: null diff --git a/examples/multimodal/multimodal_llm/neva/neva_export.py b/examples/multimodal/multimodal_llm/neva/neva_export.py index 6cf44084a564..b6710a06d32f 100644 --- a/examples/multimodal/multimodal_llm/neva/neva_export.py +++ b/examples/multimodal/multimodal_llm/neva/neva_export.py @@ -31,6 +31,8 @@ def main(cfg): max_batch_size=cfg.infer.max_batch_size, max_multimodal_len=cfg.infer.max_multimodal_len, dtype=cfg.model.precision, + lora_checkpoint_path=cfg.model.lora_path, + use_lora_plugin="auto" if cfg.model.lora_path is not None else None, load_model=False, ) diff --git a/examples/multimodal/multimodal_llm/neva/neva_trt_run.py b/examples/multimodal/multimodal_llm/neva/neva_trt_run.py index b26d4e83432f..349c3a3f805d 100644 --- a/examples/multimodal/multimodal_llm/neva/neva_trt_run.py +++ b/examples/multimodal/multimodal_llm/neva/neva_trt_run.py @@ -21,6 +21,12 @@ @hydra_runner(config_path='conf', config_name='neva_trt_infer') def main(cfg): os.environ["TOKENIZERS_PARALLELISM"] = "false" + if cfg.infer.lora_uids is not None: + lora_uids = [] + for uid in cfg.infer.lora_uids: + lora_uids.append(str(uid)) + else: + lora_uids = None exporter = TensorRTMMExporter(cfg.engine_dir) output = exporter.forward( @@ -33,6 +39,7 @@ def main(cfg): temperature=cfg.infer.temperature, repetition_penalty=cfg.infer.repetition_penalty, num_beams=cfg.infer.num_beams, + lora_uids=lora_uids, ) print(output) diff --git a/examples/slu/speech_intent_slot/eval_utils/inference.py b/examples/slu/speech_intent_slot/eval_utils/inference.py index d83d48b688fc..9bd76c76822d 100644 --- a/examples/slu/speech_intent_slot/eval_utils/inference.py +++ b/examples/slu/speech_intent_slot/eval_utils/inference.py @@ -14,7 +14,6 @@ # limitations under the License. 
-import contextlib import glob import json import os @@ -60,7 +59,12 @@ class InferenceConfig: sequence_generator: SequenceGeneratorConfig = SequenceGeneratorConfig(type="greedy") -def slurp_inference(model, path2manifest: str, batch_size: int = 4, num_workers: int = 0,) -> List[str]: +def slurp_inference( + model, + path2manifest: str, + batch_size: int = 4, + num_workers: int = 0, +) -> List[str]: if num_workers is None: num_workers = min(batch_size, os.cpu_count() - 1) @@ -178,16 +182,6 @@ def run_inference(cfg: InferenceConfig) -> InferenceConfig: logging.info(f"\nStart inference with {len(filepaths)} files...\n") - # setup AMP (optional) - if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'): - logging.info("AMP enabled!\n") - autocast = torch.cuda.amp.autocast - else: - - @contextlib.contextmanager - def autocast(): - yield - # Compute output filename if cfg.output_filename is None: # create default output filename @@ -206,7 +200,7 @@ def autocast(): return cfg # transcribe audio - with autocast(): + with torch.amp.autocast(model.device.type, enabled=cfg.amp): with torch.no_grad(): predictions = slurp_inference( model=model, diff --git a/nemo/collections/asr/models/clustering_diarizer.py b/nemo/collections/asr/models/clustering_diarizer.py index 98e56a7be48d..ddcc269bedcc 100644 --- a/nemo/collections/asr/models/clustering_diarizer.py +++ b/nemo/collections/asr/models/clustering_diarizer.py @@ -49,15 +49,6 @@ from nemo.core.classes import Model from nemo.utils import logging, model_utils -try: - from torch.cuda.amp import autocast -except ImportError: - from contextlib import contextmanager - - @contextmanager - def autocast(enabled=None): - yield - __all__ = ['ClusteringDiarizer'] @@ -223,7 +214,7 @@ def _run_vad(self, manifest_file): tqdm(self._vad_model.test_dataloader(), desc='vad', leave=True, disable=not self.verbose) ): test_batch = [x.to(self._vad_model.device) for x in test_batch] - with autocast(): + with torch.amp.autocast(self._vad_model.device.type): log_probs = self._vad_model(input_signal=test_batch[0], input_signal_length=test_batch[1]) probs = torch.softmax(log_probs, dim=-1) pred = probs[:, 1] @@ -359,7 +350,7 @@ def _extract_embeddings(self, manifest_file: str, scale_idx: int, num_scales: in ): test_batch = [x.to(self._speaker_model.device) for x in test_batch] audio_signal, audio_signal_len, labels, slices = test_batch - with autocast(): + with torch.amp.autocast(self._speaker_model.device.type): _, embs = self._speaker_model.forward(input_signal=audio_signal, input_signal_length=audio_signal_len) emb_shape = embs.shape[-1] embs = embs.view(-1, emb_shape) diff --git a/nemo/collections/asr/parts/k2/graph_transducer.py b/nemo/collections/asr/parts/k2/graph_transducer.py index 5de8064224a1..bcd49bcbd7a9 100644 --- a/nemo/collections/asr/parts/k2/graph_transducer.py +++ b/nemo/collections/asr/parts/k2/graph_transducer.py @@ -25,7 +25,7 @@ def force_float32_context() -> ContextManager: """Get context manager to force float32 precision in autocast mode.""" if torch.is_autocast_enabled(): - return torch.cuda.amp.autocast(dtype=torch.float32) + return torch.amp.autocast('cuda', dtype=torch.float32) return nullcontext() @@ -159,7 +159,10 @@ def get_graphs_batched( # composed version text_fsas = [ - self.get_unit_schema(units_tensor=targets[i, : target_lengths[i].item()], vocab_size=vocab_size,) + self.get_unit_schema( + units_tensor=targets[i, : target_lengths[i].item()], + vocab_size=vocab_size, + ) for i in 
range(batch_size) ] temporal_fsas = [ @@ -192,7 +195,8 @@ def get_logits_indices(self, target_fsas_vec: k2.Fsa, logits_shape: torch.Size) scores_to_batch_i = torch.repeat_interleave( torch.arange(batch_size, device=device, dtype=torch.int64), torch.tensor( - [target_fsas_vec.arcs.index(0, i)[0].values().shape[0] for i in range(batch_size)], device=device, + [target_fsas_vec.arcs.index(0, i)[0].values().shape[0] for i in range(batch_size)], + device=device, ), ) indices = ( @@ -442,7 +446,11 @@ def get_grid(self, units_tensor: torch.Tensor, num_frames: int, vocab_size: int) return rnnt_graph def forward( - self, acts: torch.Tensor, labels: torch.Tensor, act_lens: torch.Tensor, label_lens: torch.Tensor, + self, + acts: torch.Tensor, + labels: torch.Tensor, + act_lens: torch.Tensor, + label_lens: torch.Tensor, ) -> torch.Tensor: """ Compute forward method for RNN-T. diff --git a/nemo/collections/asr/parts/preprocessing/features.py b/nemo/collections/asr/parts/preprocessing/features.py index d70737b5135b..5138a3148e91 100644 --- a/nemo/collections/asr/parts/preprocessing/features.py +++ b/nemo/collections/asr/parts/preprocessing/features.py @@ -433,7 +433,7 @@ def forward(self, x, seq_len, linear_spec=False): x = torch.cat((x[:, 0].unsqueeze(1), x[:, 1:] - self.preemph * x[:, :-1]), dim=1) # disable autocast to get full range of stft values - with torch.cuda.amp.autocast(enabled=False): + with torch.amp.autocast(x.device.type, enabled=False): x = self.stft(x) # torch stft returns complex tensor (of shape [B,N,T]); so convert to magnitude @@ -627,7 +627,7 @@ def _apply_log(self, features: torch.Tensor) -> torch.Tensor: def _extract_spectrograms(self, signals: torch.Tensor) -> torch.Tensor: # Complex FFT needs to be done in single precision - with torch.cuda.amp.autocast(enabled=False): + with torch.amp.autocast('cuda', enabled=False): features = self._mel_spec_extractor(waveform=signals) return features diff --git a/nemo/collections/asr/parts/submodules/jasper.py b/nemo/collections/asr/parts/submodules/jasper.py index 78f81ee555bc..ec0def1b3ebb 100644 --- a/nemo/collections/asr/parts/submodules/jasper.py +++ b/nemo/collections/asr/parts/submodules/jasper.py @@ -473,7 +473,7 @@ def forward_for_export(self, x, lengths): self.set_max_len(max_len) dtype = x.dtype # Computes in float32 to avoid instabilities during training with AMP. - with torch.cuda.amp.autocast(enabled=False): + with torch.amp.autocast(x.device.type, enabled=False): # Create sample mask - 1 represents value, 0 represents pad mask = self.make_pad_mask(lengths, max_audio_length=max_len, device=x.device) mask = ~mask # 0 represents value, 1 represents pad diff --git a/nemo/collections/asr/parts/utils/asr_confidence_benchmarking_utils.py b/nemo/collections/asr/parts/utils/asr_confidence_benchmarking_utils.py index 96f90bee363c..262c98401f95 100644 --- a/nemo/collections/asr/parts/utils/asr_confidence_benchmarking_utils.py +++ b/nemo/collections/asr/parts/utils/asr_confidence_benchmarking_utils.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import contextlib import copy import os from pathlib import Path @@ -68,7 +67,7 @@ def run_confidence_benchmark( batch_size: int = 8, num_workers: int = 4, plot_dir: Optional[Union[str, Path]] = None, - autocast: Optional = None, + use_amp: Optional[bool] = False, ): """Run benchmark and plot histograms and curves, if plot_dir is provided. 
@@ -81,15 +80,8 @@ def run_confidence_benchmark( plot_dir = Path(plot_dir) is_rnnt = isinstance(model, EncDecRNNTModel) - # setup autocast if necessary - if autocast is None: - - @contextlib.contextmanager - def autocast(): - yield - # transcribe audio - with autocast(): + with torch.amp.autocast(model.device.type, enabled=use_amp): with torch.no_grad(): transcriptions = model.transcribe( audio=filepaths, batch_size=batch_size, return_hypotheses=True, num_workers=num_workers diff --git a/nemo/collections/asr/parts/utils/decoder_timestamps_utils.py b/nemo/collections/asr/parts/utils/decoder_timestamps_utils.py index c39ff7da58d9..59e050c5f656 100644 --- a/nemo/collections/asr/parts/utils/decoder_timestamps_utils.py +++ b/nemo/collections/asr/parts/utils/decoder_timestamps_utils.py @@ -449,7 +449,7 @@ def run_ASR_QuartzNet_CTC(self, asr_model: Type[EncDecCTCModel]) -> Tuple[Dict, log_prediction=asr_model._cfg.get("log_prediction", False), ) - with torch.cuda.amp.autocast(): + with torch.amp.autocast(asr_model.device.type): transcript_hyps_list = asr_model.transcribe( self.audio_file_list, batch_size=self.asr_batch_size, return_hypotheses=True ) # type: List[nemo_asr.parts.Hypothesis] @@ -577,7 +577,7 @@ def run_ASR_CitriNet_CTC(self, asr_model: Type[EncDecCTCModelBPE]) -> Tuple[Dict log_prediction=asr_model._cfg.get("log_prediction", False), ) - with torch.cuda.amp.autocast(): + with torch.amp.autocast(asr_model.device.type): transcript_hyps_list = asr_model.transcribe( self.audio_file_list, batch_size=self.asr_batch_size, return_hypotheses=True ) # type: List[nemo_asr.parts.Hypothesis] @@ -671,7 +671,7 @@ def run_ASR_BPE_CTC(self, asr_model: Type[EncDecCTCModelBPE]) -> Tuple[Dict, Dic onset_delay, mid_delay, tokens_per_chunk = self.set_buffered_infer_params(asr_model) onset_delay_in_sec = round(onset_delay * self.model_stride_in_secs, 2) - with torch.cuda.amp.autocast(): + with torch.amp.autocast(asr_model.device.type): logging.info(f"Running ASR model {self.ASR_model_name}") for idx, audio_file_path in enumerate(self.audio_file_list): diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py index cb5d21bf760a..c1e712c44aeb 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -18,7 +18,7 @@ from dataclasses import dataclass from pathlib import Path from tempfile import NamedTemporaryFile -from typing import List, Tuple, Union +from typing import List, Optional, Tuple, Union import torch from omegaconf import DictConfig @@ -42,6 +42,7 @@ def get_buffered_pred_feat_rnnt( batch_size: int, manifest: str = None, filepaths: List[list] = None, + accelerator: Optional[str] = 'cpu', ) -> List[rnnt_utils.Hypothesis]: """ Moved from examples/asr/asr_chunked_inference/rnnt/speech_to_text_buffered_infer_rnnt.py @@ -67,7 +68,7 @@ def get_buffered_pred_feat_rnnt( refs.append(row['text']) with torch.inference_mode(): - with torch.cuda.amp.autocast(): + with torch.amp.autocast('cpu' if accelerator == 'cpu' else 'cuda'): batch = [] asr.sample_offset = 0 for idx in tqdm(range(len(filepaths)), desc='Sample:', total=len(filepaths)): diff --git a/nemo/collections/asr/parts/utils/vad_utils.py b/nemo/collections/asr/parts/utils/vad_utils.py index 138b2e36b7fa..29b4f7b33898 100644 --- a/nemo/collections/asr/parts/utils/vad_utils.py +++ b/nemo/collections/asr/parts/utils/vad_utils.py @@ -40,16 +40,6 @@ from nemo.collections.common.parts.preprocessing.manifest import get_full_path 
from nemo.utils import logging -try: - from torch.cuda.amp import autocast -except ImportError: - from contextlib import contextmanager - - @contextmanager - def autocast(enabled=None): - yield - - """ This file contains all the utility functions required for voice activity detection. """ @@ -1127,7 +1117,7 @@ def generate_vad_frame_pred( status = get_vad_stream_status(data) for i, test_batch in enumerate(tqdm(vad_model.test_dataloader(), total=len(vad_model.test_dataloader()))): test_batch = [x.to(vad_model.device) for x in test_batch] - with autocast(): + with torch.amp.autocast(vad_model.device.type): if use_feat: log_probs = vad_model(processed_signal=test_batch[0], processed_signal_length=test_batch[1]) else: diff --git a/nemo/collections/audio/modules/masking.py b/nemo/collections/audio/modules/masking.py index cfb575eea879..3f0380dccb5d 100644 --- a/nemo/collections/audio/modules/masking.py +++ b/nemo/collections/audio/modules/masking.py @@ -668,6 +668,7 @@ def forward(self, input: torch.Tensor, activity: torch.Tensor) -> torch.Tensor: """ B, num_inputs, F, T = input.shape num_outputs = activity.size(1) + device = input.device.type if activity.size(0) != B: raise ValueError(f'Batch dimension mismatch: activity {activity.shape} vs input {input.shape}') @@ -678,7 +679,7 @@ def forward(self, input: torch.Tensor, activity: torch.Tensor) -> torch.Tensor: if num_outputs == 1: raise ValueError(f'Expecting multiple outputs, got {num_outputs}') - with torch.cuda.amp.autocast(enabled=False): + with torch.amp.autocast(device, enabled=False): input = input.to(dtype=self.dtype) assert input.is_complex(), f'Expecting complex input, got {input.dtype}' @@ -1039,8 +1040,9 @@ def forward( shape (B, C, F, T). """ io_dtype = input.dtype + device = input.device.type - with torch.cuda.amp.autocast(enabled=False): + with torch.amp.autocast(device, enabled=False): output = input.to(dtype=self.dtype) if not output.is_complex(): diff --git a/nemo/collections/audio/modules/transforms.py b/nemo/collections/audio/modules/transforms.py index 6839ae0f7598..cfa0c2c8ebb7 100644 --- a/nemo/collections/audio/modules/transforms.py +++ b/nemo/collections/audio/modules/transforms.py @@ -143,7 +143,7 @@ def forward( input = input.view(B, -1, T) # STFT output (B, C, F, N) - with torch.cuda.amp.autocast(enabled=False): + with torch.amp.autocast(input.device.type, enabled=False): output = self.stft(input.float()) if self.magnitude_power != 1: @@ -265,7 +265,7 @@ def forward( input = input.view(B, -1, T) # STFT output (B, C, F, N) - with torch.cuda.amp.autocast(enabled=False): + with torch.amp.autocast(input.device.type, enabled=False): output = self.stft(input.float()) if self.magnitude_power != 1: @@ -414,7 +414,7 @@ def forward(self, input: torch.Tensor, input_length: Optional[torch.Tensor] = No input = input.view(B, -1, F, N) # iSTFT output (B, C, T) - with torch.cuda.amp.autocast(enabled=False): + with torch.amp.autocast(input.device.type, enabled=False): output = input.cfloat() if self.scale != 1: @@ -533,7 +533,7 @@ def forward(self, input: torch.Tensor, input_length: Optional[torch.Tensor] = No input = input.view(B, -1, F, N) # iSTFT output (B, C, T) - with torch.cuda.amp.autocast(enabled=False): + with torch.amp.autocast(input.device.type, enabled=False): output = input.cfloat() if self.scale != 1: diff --git a/nemo/collections/audio/parts/submodules/multichannel.py b/nemo/collections/audio/parts/submodules/multichannel.py index aff0f28cfc3a..0fa4f8bf238b 100644 --- 
a/nemo/collections/audio/parts/submodules/multichannel.py +++ b/nemo/collections/audio/parts/submodules/multichannel.py @@ -597,7 +597,7 @@ def forward(self, input: torch.Tensor, mask_s: torch.Tensor, mask_n: torch.Tenso """ iodtype = input.dtype - with torch.cuda.amp.autocast(enabled=False): + with torch.amp.autocast(self.device.type, enabled=False): # Convert to double input = input.cdouble() mask_s = mask_s.double() diff --git a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py index dc0cef692ee2..76dca1268c3b 100644 --- a/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py +++ b/nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py @@ -27,7 +27,7 @@ class AutoTokenizer(TokenizerSpec): """ - Wrapper of HuggingFace AutoTokenizer https://huggingface.co/transformers/model_doc/auto.html#autotokenizer. + Wrapper of HuggingFace AutoTokenizer https://huggingface.co/transformers/model_doc/auto.html#autotokenizer. """ @@ -46,15 +46,14 @@ def __init__( use_fast: Optional[bool] = False, trust_remote_code: Optional[bool] = False, ): - """ Args: - pretrained_model_name: corresponds to HuggingFace-AutoTokenizer's 'pretrained_model_name_or_path' input argument. - For more details please refer to https://huggingface.co/transformers/_modules/transformers/tokenization_auto.html#AutoTokenizer.from_pretrained. + pretrained_model_name: corresponds to HuggingFace-AutoTokenizer's 'pretrained_model_name_or_path' input argument. + For more details please refer to https://huggingface.co/transformers/_modules/transformers/tokenization_auto.html#AutoTokenizer.from_pretrained. The list of all supported models can be found here: ALL_PRETRAINED_CONFIG_ARCHIVE_MAP vocab_file: path to file with vocabulary which consists of characters separated by newlines. - mask_token: mask token + mask_token: mask token bos_token: the beginning of sequence token eos_token: the end of sequence token. Usually equal to sep_token pad_token: token to use for padding @@ -132,24 +131,24 @@ def __init__( if len(new_tokens_in_vocab) > 0: """ - Special tokens that were not previously included in the tokenizer's vocabulary file will be added to + Special tokens that were not previously included in the tokenizer's vocabulary file will be added to the vocabulary and, as a result, the model should be resized, for example: - + # define your model pretrained_model_name = 'roberta-base' model = nemo_nlp.modules.get_lm_model(pretrained_model_name=pretrained_model_name) - + # define pretrained tokenizer tokenizer_default = nemo_nlp.modules.get_tokenizer(tokenizer_name=pretrained_model_name) - + special_tokens = {'bos_token': '', 'cls_token': '', 'additional_special_tokens': ['', '']} tokenizer_default.add_special_tokens(special_tokens_dict=special_tokens) - + # resize your model so that the embeddings for newly added tokens are updated during training/finetuning model.resize_token_embeddings(tokenizer_default.vocab_size) - + See NLP_Tokenizers.ipynb for more details. 
""" logging.warning( @@ -159,6 +158,7 @@ def __init__( ) self.add_special_tokens(special_tokens_dict) self.space_sensitive = self.text_to_tokens('x y') != self.text_to_tokens('x') + self.text_to_tokens('y') + self._inv_vocab_dict = {} @property def vocab_size(self): @@ -226,6 +226,12 @@ def vocab(self): id2vocab = {v: k for k, v in self.tokenizer.vocab.items()} return [id2vocab[i] for i in range(len(id2vocab))] + @property + def inv_vocab(self): + if self._inv_vocab_dict == {}: + self._inv_vocab_dict = {v: k for k, v in self.tokenizer.vocab.items()} + return self._inv_vocab_dict + @property def pad_id(self): if getattr(self, 'pad_token') is None: diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index e790e4ae84a8..bc6f4dd9201e 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -18,7 +18,6 @@ safe_import("transformer_engine") from nemo.collections.llm import peft, tokenizer - from nemo.collections.llm.gpt.data import ( DollyDataModule, FineTuningDataModule, @@ -102,7 +101,7 @@ gpt_data_step, gpt_forward_step, ) - +from nemo.collections.llm.t5.model import T5Config, T5Model, t5_data_step, t5_forward_step __all__ = [ "MockDataModule", @@ -110,6 +109,10 @@ "GPTConfig", "gpt_data_step", "gpt_forward_step", + "T5Model", + "T5Config", + "t5_data_step", + "t5_forward_step", "MaskedTokenLossReduction", "MistralConfig7B", "MistralModel", @@ -184,6 +187,7 @@ try: import nemo_run as run + from nemo.collections.llm.api import export_ckpt, finetune, import_ckpt, pretrain, train, validate from nemo.collections.llm.recipes import * # noqa diff --git a/nemo/collections/llm/gpt/model/baichuan.py b/nemo/collections/llm/gpt/model/baichuan.py index 94e1f2bb6753..e193afd4016d 100644 --- a/nemo/collections/llm/gpt/model/baichuan.py +++ b/nemo/collections/llm/gpt/model/baichuan.py @@ -144,8 +144,10 @@ def make_vocab_size_divisible_by(vocab_size): class HFBaichuan2Exporter(io.ModelConnector[Baichuan2Model, "AutoModelForCausalLM"]): def init(self) -> "AutoModelForCausalLM": from transformers import AutoModelForCausalLM + from transformers.modeling_utils import no_init_weights - return AutoModelForCausalLM.from_config(self.config, trust_remote_code=True) + with no_init_weights(True): + return AutoModelForCausalLM.from_config(self.config, trust_remote_code=True) def apply(self, output_path: Path) -> Path: target = self.init() diff --git a/nemo/collections/llm/gpt/model/chatglm.py b/nemo/collections/llm/gpt/model/chatglm.py index 297a02501f5f..68e022d9c0ba 100644 --- a/nemo/collections/llm/gpt/model/chatglm.py +++ b/nemo/collections/llm/gpt/model/chatglm.py @@ -141,8 +141,10 @@ def config(self) -> ChatGLMConfig: class HFChatGLMExporter(io.ModelConnector[ChatGLMModel, "AutoModelForCausalLM"]): def init(self) -> "AutoModelForCausalLM": from transformers import AutoModelForCausalLM + from transformers.modeling_utils import no_init_weights - return AutoModelForCausalLM.from_config(self.config, trust_remote_code=True) + with no_init_weights(True): + return AutoModelForCausalLM.from_config(self.config, trust_remote_code=True) def apply(self, output_path: Path) -> Path: target = self.init() diff --git a/nemo/collections/llm/gpt/model/gemma.py b/nemo/collections/llm/gpt/model/gemma.py index 400b330b958c..a753b8617d3f 100644 --- a/nemo/collections/llm/gpt/model/gemma.py +++ b/nemo/collections/llm/gpt/model/gemma.py @@ -173,8 +173,10 @@ def make_vocab_size_divisible_by(vocab_size): class HFGemmaExporter(io.ModelConnector[GemmaModel, "GemmaForCausalLM"]): def 
init(self) -> "GemmaForCausalLM": from transformers import AutoModelForCausalLM + from transformers.modeling_utils import no_init_weights - return AutoModelForCausalLM.from_config(self.config) + with no_init_weights(True): + return AutoModelForCausalLM.from_config(self.config) def apply(self, output_path: Path) -> Path: target = self.init() diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py index b2410c99796d..7f5d015a0abd 100644 --- a/nemo/collections/llm/gpt/model/llama.py +++ b/nemo/collections/llm/gpt/model/llama.py @@ -291,8 +291,10 @@ def make_vocab_size_divisible_by(vocab_size): class HFLlamaExporter(io.ModelConnector[LlamaModel, "LlamaForCausalLM"]): def init(self) -> "LlamaForCausalLM": from transformers import AutoModelForCausalLM + from transformers.modeling_utils import no_init_weights - return AutoModelForCausalLM.from_config(self.config) + with no_init_weights(True): + return AutoModelForCausalLM.from_config(self.config) def apply(self, output_path: Path) -> Path: target = self.init() diff --git a/nemo/collections/llm/gpt/model/mistral.py b/nemo/collections/llm/gpt/model/mistral.py index 0607baf5cc3f..67233d0195a7 100644 --- a/nemo/collections/llm/gpt/model/mistral.py +++ b/nemo/collections/llm/gpt/model/mistral.py @@ -188,8 +188,10 @@ def make_vocab_size_divisible_by(mistral_vocab_size): class HFMistralExporter(io.ModelConnector[MistralModel, "MistralForCausalLM"]): def init(self) -> "MistralForCausalLM": from transformers import AutoModelForCausalLM + from transformers.modeling_utils import no_init_weights - return AutoModelForCausalLM.from_config(self.config) + with no_init_weights(True): + return AutoModelForCausalLM.from_config(self.config) def apply(self, output_path: Path) -> Path: # TODO: Make it work with lazy init @@ -339,3 +341,11 @@ def _export_linear_fc1(linear_fc1): gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0) return gate_proj, up_proj + + +__all__ = [ + "MistralConfig7B", + "MistralNeMo2407Config12B", + "MistralNeMo2407Config123B", + "MistralModel", +] diff --git a/nemo/collections/llm/gpt/model/mixtral.py b/nemo/collections/llm/gpt/model/mixtral.py index bb3dc0068ca3..23b83960a9ec 100644 --- a/nemo/collections/llm/gpt/model/mixtral.py +++ b/nemo/collections/llm/gpt/model/mixtral.py @@ -268,8 +268,10 @@ def _import_moe_w1_w3(gate_proj, up_proj): class HFMixtralExporter(io.ModelConnector[MixtralModel, "MixtralForCausalLM"]): def init(self) -> "MixtralForCausalLM": from transformers import AutoModelForCausalLM + from transformers.modeling_utils import no_init_weights - return AutoModelForCausalLM.from_config(self.config) + with no_init_weights(True): + return AutoModelForCausalLM.from_config(self.config) def apply(self, output_path: Path) -> Path: # TODO: Make it work with lazy init @@ -383,3 +385,12 @@ def _export_moe_w1_w3(linear_fc1): gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0) return gate_proj, up_proj + + +__all__ = [ + "MixtralConfig", + "MixtralConfig8x3B", + "MixtralConfig8x7B", + "MixtralConfig8x22B", + "MixtralModel", +] diff --git a/nemo/collections/llm/gpt/model/nemotron.py b/nemo/collections/llm/gpt/model/nemotron.py index 43a94b30b180..3db1fbdcbfb7 100644 --- a/nemo/collections/llm/gpt/model/nemotron.py +++ b/nemo/collections/llm/gpt/model/nemotron.py @@ -212,7 +212,10 @@ def make_vocab_size_divisible_by(vocab_size): @io.model_exporter(NemotronModel, "hf") class HFNemotronExporter(io.ModelConnector[NemotronModel, "NemotronForCausalLM"]): def init(self) -> "NemotronForCausalLM": - 
return NemotronForCausalLM.from_config(self.config) + from transformers.modeling_utils import no_init_weights + + with no_init_weights(True): + return NemotronForCausalLM.from_config(self.config) def apply(self, output_path: Path) -> Path: target = self.init() diff --git a/nemo/collections/llm/gpt/model/qwen2.py b/nemo/collections/llm/gpt/model/qwen2.py index d1c977ee9d7c..ba59f874176a 100644 --- a/nemo/collections/llm/gpt/model/qwen2.py +++ b/nemo/collections/llm/gpt/model/qwen2.py @@ -174,8 +174,10 @@ def config(self) -> Qwen2Config: class HFQwen2Exporter(io.ModelConnector[Qwen2Model, "AutoModelForCausalLM"]): def init(self) -> "AutoModelForCausalLM": from transformers import AutoModelForCausalLM + from transformers.modeling_utils import no_init_weights - return AutoModelForCausalLM.from_config(self.config, trust_remote_code=True) + with no_init_weights(True): + return AutoModelForCausalLM.from_config(self.config, trust_remote_code=True) def apply(self, output_path: Path) -> Path: target = self.init() diff --git a/nemo/collections/llm/gpt/model/starcoder.py b/nemo/collections/llm/gpt/model/starcoder.py index e796f508ee58..34bff1aa613d 100644 --- a/nemo/collections/llm/gpt/model/starcoder.py +++ b/nemo/collections/llm/gpt/model/starcoder.py @@ -159,8 +159,10 @@ def make_vocab_size_divisible_by(vocab_size): class HFStarcoderExporter(io.ModelConnector[StarcoderModel, "GPTBigCodeForCausalLM"]): def init(self) -> "GPTBigCodeForCausalLM": from transformers import GPTBigCodeForCausalLM + from transformers.modeling_utils import no_init_weights - return GPTBigCodeForCausalLM._from_config(self.config) + with no_init_weights(True): + return GPTBigCodeForCausalLM._from_config(self.config) def apply(self, output_path: Path) -> Path: target = self.init() diff --git a/nemo/collections/llm/gpt/model/starcoder2.py b/nemo/collections/llm/gpt/model/starcoder2.py index 7926a92cf352..33f97a4a3e8c 100644 --- a/nemo/collections/llm/gpt/model/starcoder2.py +++ b/nemo/collections/llm/gpt/model/starcoder2.py @@ -184,8 +184,10 @@ def make_vocab_size_divisible_by(vocab_size): class HFStarcoder2Exporter(io.ModelConnector[Starcoder2Model, "Starcoder2ForCausalLM"]): def init(self) -> "Starcoder2ForCausalLM": from transformers import Starcoder2ForCausalLM + from transformers.modeling_utils import no_init_weights - return Starcoder2ForCausalLM._from_config(self.config) + with no_init_weights(True): + return Starcoder2ForCausalLM._from_config(self.config) def apply(self, output_path: Path) -> Path: target = self.init() diff --git a/nemo/collections/llm/peft/lora.py b/nemo/collections/llm/peft/lora.py index 0d2a98fa3dfb..bdd23be4b029 100644 --- a/nemo/collections/llm/peft/lora.py +++ b/nemo/collections/llm/peft/lora.py @@ -50,7 +50,7 @@ def forward(self, x): linear_output, bias, layernorm_output = linear_output x = layernorm_output - adapter_output = self.adapter(x) + adapter_output = self.adapter(x.contiguous()) return linear_output + adapter_output, bias diff --git a/nemo/collections/llm/recipes/README.md b/nemo/collections/llm/recipes/README.md index a3cf715acffb..d56fc25a6d7f 100644 --- a/nemo/collections/llm/recipes/README.md +++ b/nemo/collections/llm/recipes/README.md @@ -16,10 +16,10 @@ Recipes are designed to be modular and extensible, allowing users to easily cust ### Command Line Interface -You can use these recipes via the NeMo CLI: +You can use these recipes via the NeMo CLI (provided by [NeMo-Run](https://github.com/NVIDIA/NeMo-Run)): ```bash -nemorun llm --factory +nemo llm --factory ``` Where: - `` is 
either `pretrain` or `finetune` @@ -27,16 +27,19 @@ Where: For example: ```bash -nemorun llm pretrain --factory llama3_8b +nemo llm pretrain --factory llama3_8b ``` +> [!IMPORTANT] +> When launching the recipes with multiple processes (i.e. on multiple GPUs), add the `-y` option to the command to avoid user confirmation prompts. +> For example, `nemo llm pretrain --factory llama3_8b -y` ### Customizing Parameters You can override any parameter in the recipe: ```bash -nemorun llm pretrain --factory llama3_8b trainer.max_steps=2000 +nemo llm pretrain --factory llama3_8b trainer.max_steps=2000 ``` For more details around running recipes, see [pre-train](../../../../examples/llm/pretrain/README.md). diff --git a/nemo/collections/llm/t5/__init__.py b/nemo/collections/llm/t5/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/nemo/collections/llm/t5/data/__init__.py b/nemo/collections/llm/t5/data/__init__.py new file mode 100644 index 000000000000..537c12fd9115 --- /dev/null +++ b/nemo/collections/llm/t5/data/__init__.py @@ -0,0 +1,3 @@ +from nemo.collections.llm.t5.data.pre_training import PreTrainingDataModule + +__all__ = ["PreTrainingDataModule"] diff --git a/nemo/collections/llm/t5/data/pre_training.py b/nemo/collections/llm/t5/data/pre_training.py new file mode 100644 index 000000000000..2c73e0b78b11 --- /dev/null +++ b/nemo/collections/llm/t5/data/pre_training.py @@ -0,0 +1,329 @@ +import logging +import warnings +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +import pytorch_lightning as pl +from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +from torch.utils import data + +from nemo.lightning.data import WrappedDataLoader +from nemo.lightning.io.mixin import IOMixin +from nemo.lightning.pytorch.plugins import MegatronDataSampler + +if TYPE_CHECKING: + from megatron.core.datasets.t5_dataset import T5MaskedWordPieceDatasetConfig + + from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec + + +class PreTrainingDataModule(pl.LightningDataModule, IOMixin): + """PyTorch Lightning-compatible data module for pre-training + T5-style models. + Args: + paths (Path | List | Dict[str, List]): Paths of the data distributions. Can be either a + single path, a list of paths, or a dictionary. If a single path or a list of paths, + the given paths will be used to generate the train, validation and test datasets. If + providing a list of paths, the format can be either (1) a list of paths, e.g. + ["path/to/dataset_1_prefix", "path/to/dataset_2_prefix"], + or (2) a flattened, zipped list of weights and paths, e.g. + ["30", "path/to/dataset_1_prefix", "70", "path/to/dataset_2_prefix"] + If a dictionary is provided, it is expected to have the following form: + { + 'train': , + 'validation': , + 'test': + } + where each value is either a path or a list of paths as described above. + In this case, each split will be generated using the given paths. + Note that if limit_val_batches <= 1, we generate the entire validaton dataset, so + weights should not be provided for the validation split. + seq_length (int): Sequence length. + seq_length_dec (int): Sequence length of decoder. + tokenizer (Optional["TokenizerSpec"]): An instance of a TokenizerSpec object. + micro_batch_size (int): Batch size per GPU. + global_batch_size (int): Global batch size. + rampup_batch_size (Optional[List[int]]): Rampup batch size, should be in format of + [start_global_batch_size, batch_size_increment, ramup_samples]. 
+ num_workers (int): See ``torch.utils.data.DataLoader`` documentation. + pin_memory (bool): See ``torch.utils.data.DataLoader`` documentation. + persistent_workers (bool): See ``torch.utils.data.DataLoader`` documentation. + masking_probability (float): + short_sequence_probability (float): + masking_max_ngram (int): + masking_do_full_word (bool): + masking_do_permutation (bool): + masking_use_longer_ngrams (bool): + masking_use_geometric_distribution (bool): + seed (int): Seed for generating the T5 dataset. + split (str): A string of 3 comma-separated integers denoting how much of the distribution + to allocate to train, validation, and test sets, respectively. Unused if ``paths`` is a dict. + index_mapping_dir (Optional[str]): Path to a directory to write index mapping files. + """ + + def __init__( + self, + paths: Path | List | Dict[str, List], + seq_length: int = 512, + seq_length_dec: int = 128, + tokenizer: Optional["TokenizerSpec"] = None, + micro_batch_size: int = 64, + global_batch_size: int = 512, + rampup_batch_size: Optional[List[int]] = None, + num_workers: int = 8, + pin_memory: bool = True, + persistent_workers: bool = False, + masking_probability: float = 0.15, + short_sequence_probability: float = 0.1, + masking_max_ngram: int = 10, + masking_do_full_word: bool = True, + masking_do_permutation: bool = False, + masking_use_longer_ngrams: bool = False, + masking_use_geometric_distribution: bool = True, + seed: int = 1234, + split: str = "999982,9,9", + index_mapping_dir: Optional[str] = None, + ) -> None: + super().__init__() + if not isinstance(paths, (list, tuple, dict)): + paths = [paths] + + from megatron.core.datasets.utils import get_blend_from_list + + build_kwargs = {} + if isinstance(paths, dict): + if split is not None: + warnings.warn( + f"{split=} will be ignored since datasets are being created " f"from 3 separate distributions." 
+ ) + build_kwargs["blend_per_split"] = [ + get_blend_from_list(paths["train"]), + get_blend_from_list(paths["validation"]), + get_blend_from_list(paths["test"]), + ] + else: + paths, weights = get_blend_from_list(paths) + if len(paths) == 1: + weights = None + build_kwargs["blend"] = [paths, weights] + build_kwargs["split"] = split + + self.build_kwargs = build_kwargs + self.seq_length = seq_length + self.seq_length_dec = seq_length_dec + self.tokenizer = tokenizer + self.num_workers = num_workers + self.pin_memory = pin_memory + self.persistent_workers = persistent_workers + self.masking_probability = masking_probability + self.short_sequence_probability = short_sequence_probability + self.masking_max_ngram = masking_max_ngram + self.masking_do_full_word = masking_do_full_word + self.masking_do_permutation = masking_do_permutation + self.masking_use_longer_ngrams = masking_use_longer_ngrams + self.masking_use_geometric_distribution = masking_use_geometric_distribution + self.seed = seed + self.split = split + self.index_mapping_dir = index_mapping_dir + self.init_global_step = 0 + + # add additional tokens for T5 tokenizer + from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + + self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "BertWordPieceCase") + additional_tokens = {'additional_special_tokens': [f'' for i in range(100)]} + self.tokenizer.add_special_tokens(additional_tokens) + + self.data_sampler = MegatronDataSampler( + seq_len=self.seq_length, + micro_batch_size=micro_batch_size, + global_batch_size=global_batch_size, + rampup_batch_size=rampup_batch_size, + ) + + def setup(self, stage: str = "") -> None: + from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder + from megatron.core.datasets.t5_dataset import T5MaskedWordPieceDataset + + assert ( + hasattr(self, "trainer") and self.trainer is not None + ), "Setup should be completed when trainer and config are attached." + + # Trainer API + max_train_steps = self.trainer.max_steps + assert max_train_steps > 0, "Please specify trainer.max_steps" + eval_iters = (max_train_steps // self.trainer.val_check_interval + 1) * self.trainer.limit_val_batches + test_iters = self.trainer.limit_test_batches + num_train_samples = int(max_train_steps * self.data_sampler.global_batch_size) + num_val_samples = int(eval_iters * self.data_sampler.global_batch_size) + num_test_samples = int(test_iters * self.data_sampler.global_batch_size) + + if self.trainer.limit_val_batches <= 1.0 and isinstance(self.trainer.limit_val_batches, float): + assert "blend" not in self.build_kwargs, ( + "When using a single data distribution, limit_val_batches <= 1.0 is not supported. 
If you'd " + "like to run with a fractional value of limit_val_batches, please pass in separate datasets for " + "the train, validation, and test datasets by providing a dictionary of paths, e.g.: \n" + " paths={ \n " + " 'train': [PATHS FOR TRAIN], \n " + " 'validation': [PATHS FOR VALIDATION], \n " + " 'test' :[PATHS FOR TEST], \n" + " }" + ) + + # This is to make sure we only have one epoch on every validation iteration + num_val_samples = None + + train_valid_test_num_samples = [num_train_samples, num_val_samples, num_test_samples] + self._train_ds, self._validation_ds, self._test_ds = BlendedMegatronDatasetBuilder( + T5MaskedWordPieceDataset, + train_valid_test_num_samples, + is_built_on_rank=lambda: True, + config=self.t5_dataset_config, + ).build() + + # uncomment once fabric API is merged + # def fabric_setup( + # self, + # fabric: fl.Fabric, + # num_train_samples: int, + # num_val_samples: int, + # num_test_samples: int, + # ) -> None: + # from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder + # from megatron.core.datasets.t5_dataset import T5MaskedWordPieceDataset + # + # del fabric + # train_valid_test_num_samples = [num_train_samples, num_val_samples, num_test_samples] + # self._train_ds, self._validation_ds, self._test_ds = BlendedMegatronDatasetBuilder( + # T5MaskedWordPieceDataset, train_valid_test_num_samples, self.t5_dataset_config, + # ).build() + + def train_dataloader(self) -> TRAIN_DATALOADERS: + return self._create_dataloader(self._train_ds, mode='train') + + def val_dataloader(self) -> EVAL_DATALOADERS: + return self._create_dataloader(self._validation_ds, mode='validation') + + def test_dataloader(self) -> EVAL_DATALOADERS: + return self._create_dataloader(self._test_ds, mode='test') + + def _create_dataloader(self, dataset, mode, **kwargs) -> WrappedDataLoader: + self.init_global_step = self.trainer.global_step + dataloader = WrappedDataLoader( + mode=mode, + dataset=dataset, + num_workers=self.num_workers, + pin_memory=self.pin_memory, + persistent_workers=self.persistent_workers, + collate_fn=getattr(dataset, 'collate_fn', data.dataloader.default_collate), + **kwargs, + ) + return dataloader + + @property + def t5_dataset_config(self) -> "T5MaskedWordPieceDatasetConfig": + from megatron.core.datasets.t5_dataset import T5MaskedWordPieceDatasetConfig + + return T5MaskedWordPieceDatasetConfig( + random_seed=self.seed, + sequence_length=self.seq_length, + sequence_length_decoder=self.seq_length_dec, + tokenizer=self.tokenizer, + path_to_cache=self.index_mapping_dir, + masking_probability=self.masking_probability, + short_sequence_probability=self.short_sequence_probability, + masking_max_ngram=self.masking_max_ngram, + masking_do_full_word=self.masking_do_full_word, + masking_do_permutation=self.masking_do_permutation, + masking_use_longer_ngrams=self.masking_use_longer_ngrams, + masking_use_geometric_distribution=self.masking_use_geometric_distribution, + **self.build_kwargs, + ) + + def state_dict(self) -> Dict[str, Any]: + """Called when saving a checkpoint, implement to generate and save datamodule state. + + Returns: + A dictionary containing datamodule state. 
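Here that state is the number of consumed samples, computed by the data sampler from how far the trainer's global step has advanced since this module was initialized.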
+ + """ + consumed_samples = self.data_sampler.compute_consumed_samples(self.trainer.global_step - self.init_global_step) + return {'consumed_samples': consumed_samples} + + def load_state_dict(self, state_dict: Dict[str, Any]) -> None: + """Called when loading a checkpoint, implement to reload datamodule state given datamodule stat + + Args: + state_dict: the datamodule state returned by ``state_dict``. + + """ + try: + from megatron.core.num_microbatches_calculator import update_num_microbatches + + except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import update_num_microbatches + + consumed_samples = state_dict['consumed_samples'] + self.data_sampler.init_consumed_samples = consumed_samples + self.data_sampler.prev_consumed_samples = consumed_samples + + update_num_microbatches( + consumed_samples=consumed_samples, + consistency_check=False, + ) + self.data_sampler.if_first_step = 1 + + def reconfigure_limit_batches(self): + # Override limit_train_batches in terms of num of microbatches + self._reconfigure_limit_batches(self.trainer.limit_train_batches, self._train_ds, 'train') + # Override limit_val_batches to be a multiple of num microbatches to prevent val_step from exiting in between a step + self._reconfigure_limit_batches(self.trainer.limit_val_batches, self._validation_ds, 'val') + + def _reconfigure_limit_batches(self, limit_batches, dataloader, mode): + """ + Reconfigure trainer.limit_val_batches for pretraining + """ + # Override limit_batches in terms of num microbatches and so there are limit_batches//num_micro_batches num of global batches + try: + from megatron.core.num_microbatches_calculator import get_num_microbatches + + except (ImportError, ModuleNotFoundError): + logging.warning("Megatron num_microbatches_calculator not found, using Apex version.") + from apex.transformer.pipeline_parallel.utils import get_num_microbatches + + if isinstance(limit_batches, int): + limit_batches *= get_num_microbatches() + else: + assert isinstance(limit_batches, float) + # Don't reconfigure if limit_batches is 0.0 or if there's no dataloader + if limit_batches == 0.0 or dataloader is None: + return + # len(dataloader) returns len as num of microbatches + dl_len_in_micro_batches = len(dataloader) + if len(dataloader) != float("inf"): + if limit_batches == 1.0: + limit_batches = dl_len_in_micro_batches + else: + limit_micro_batches = int(dl_len_in_micro_batches * limit_batches) + if limit_micro_batches == 0 and limit_batches > 0.0: + min_percentage = 1.0 / len(dataloader) + raise MisconfigurationException( + f"You requested to check {limit_batches} of the val_dataloader but" + f" {limit_batches} * {len(dataloader)} < 1. Please increase the" + f" `limit_val_batches` argument. 
Try at least" + f" `limit_val_batches={min_percentage}`" + ) + # Make sure trainer.limit_val_batches is a multiple of num of microbatches + if limit_micro_batches < get_num_microbatches(): + limit_batches = get_num_microbatches() + else: + limit_batches = limit_batches - limit_batches % get_num_microbatches() + + if mode == 'train': + self.trainer.limit_train_batches = limit_batches + else: + self.trainer.limit_val_batches = limit_batches + + # Override num sanity steps to be a multiple of num of microbatches + self.trainer.num_sanity_val_steps *= get_num_microbatches() diff --git a/nemo/collections/llm/t5/model/__init__.py b/nemo/collections/llm/t5/model/__init__.py new file mode 100644 index 000000000000..088173857efd --- /dev/null +++ b/nemo/collections/llm/t5/model/__init__.py @@ -0,0 +1,19 @@ +from nemo.collections.llm.t5.model.t5 import ( + MaskedTokenLossReduction, + T5Config, + T5Model, + local_layer_spec, + t5_data_step, + t5_forward_step, + transformer_engine_layer_spec, +) + +__all__ = [ + "T5Config", + "T5Model", + "MaskedTokenLossReduction", + "t5_data_step", + "t5_forward_step", + "transformer_engine_layer_spec", + "local_layer_spec", +] diff --git a/nemo/collections/llm/t5/model/t5.py b/nemo/collections/llm/t5/model/t5.py new file mode 100644 index 000000000000..2df5d633e200 --- /dev/null +++ b/nemo/collections/llm/t5/model/t5.py @@ -0,0 +1,255 @@ +import copy +from dataclasses import dataclass +from typing import TYPE_CHECKING, Callable, Dict, Literal, Optional, Union + +import pytorch_lightning as L +import torch +import torch.distributed +from megatron.core.optimizer import OptimizerConfig +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.transformer.transformer_config import TransformerConfig +from torch import nn + +from nemo.collections.llm import fn +from nemo.lightning import get_vocab_size, io +from nemo.lightning.megatron_parallel import MaskedTokenLossReduction +from nemo.lightning.pytorch.optim import MegatronOptimizerModule, OptimizerModule + +HAVE_TE = True +try: + import transformer_engine +except (ImportError, ModuleNotFoundError): + HAVE_TE = False + +if TYPE_CHECKING: + from megatron.core.models.T5.t5_model import T5Model as MCoreT5Model + + from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec + + +def t5_data_step(dataloader_iter) -> Dict[str, torch.Tensor]: + from megatron.core import parallel_state + + batch = next(dataloader_iter) + + _batch: dict + # TODO: to fix for running inferencing + if isinstance(batch, tuple) and len(batch) == 3: + _batch = batch[0] + else: + _batch = batch + + # convert attention mask values from int to True/False + _batch['enc_mask'] = _batch['enc_mask'] < 0.5 + _batch['dec_mask'] = _batch['dec_mask'] < 0.5 + _batch['enc_dec_mask'] = _batch['enc_dec_mask'] < 0.5 + + required_keys = set() + required_keys.update(["enc_mask", "dec_mask", "enc_dec_mask"]) + if parallel_state.is_pipeline_first_stage(): + required_keys.update(("text_enc", "text_dec")) + if parallel_state.is_pipeline_last_stage(): + required_keys.update(("labels", "loss_mask")) + + output = {key: val.cuda(non_blocking=True) if key in required_keys else None for key, val in _batch.items()} + + return output + + +def t5_forward_step(model, batch) -> torch.Tensor: + forward_args = { + "encoder_input_ids": batch["text_enc"], + "decoder_input_ids": batch["text_dec"], + "encoder_attn_mask": batch["enc_mask"], + "decoder_attn_mask": batch["dec_mask"], + "encoder_decoder_attn_mask": batch["enc_dec_mask"], + "lm_labels": 
batch["labels"], + } + + return model(**forward_args) + + +def transformer_engine_layer_spec(encoder_config: "T5Config", decoder_config: "T5Config") -> ModuleSpec: + from megatron.core.models.T5.t5_spec import ( + get_t5_decoder_with_transformer_engine_block_spec, + get_t5_encoder_with_transformer_engine_block_spec, + ) + + en_block_spec = get_t5_encoder_with_transformer_engine_block_spec(encoder_config.num_layers) + de_block_spec = get_t5_decoder_with_transformer_engine_block_spec(decoder_config.num_layers) + + return [en_block_spec, de_block_spec] + + +def local_layer_spec(encoder_config: "T5Config", decoder_config: "T5Config") -> ModuleSpec: + from megatron.core.models.T5.t5_spec import ( + get_t5_decoder_with_local_block_spec, + get_t5_encoder_with_local_block_spec, + ) + + en_block_spec = get_t5_encoder_with_local_block_spec(encoder_config.num_layers) + de_block_spec = get_t5_decoder_with_local_block_spec(decoder_config.num_layers) + + return [en_block_spec, de_block_spec] + + +def default_layer_spec(encoder_config: "T5Config", decoder_config: "T5Config") -> ModuleSpec: + if HAVE_TE: + return transformer_engine_layer_spec(encoder_config, decoder_config) + else: + return local_layer_spec(encoder_config, decoder_config) + + +@dataclass +class T5Config(TransformerConfig, io.IOMixin): + # From megatron.core.models.t5.t5_model.T5Model + encoder_num_layers: int = None + fp16_lm_cross_entropy: bool = False + parallel_output: bool = True + share_embeddings_and_output_weights: bool = True + make_vocab_size_divisible_by: int = 128 + position_embedding_type: Literal["learned_absolute", "rope"] = "learned_absolute" + max_position_embeddings: int = 512 + rotary_percent: float = 1.0 + seq_len_interpolation_factor: Optional[float] = None + encoder_pipeline_model_parallel_size: int = 0 + attention_softmax_in_fp32: float = False + bias_activation_fusion: bool = True + masked_softmax_fusion: bool = True + persist_layer_norm: bool = True + bias_dropout_fusion: bool = True + deallocate_pipeline_outputs: bool = True + pipeline_model_parallel_split_rank: int = 0 + num_moe_experts: int = 1 + recompute_num_layers: int = 1 + distribute_saved_activations: bool = False + enable_autocast: bool = False + + transformer_layer_spec: Union[ModuleSpec, Callable[["T5Config"], ModuleSpec]] = default_layer_spec + forward_step_fn: Callable = t5_forward_step + data_step_fn: Callable = t5_data_step + + def configure_model(self, tokenizer) -> "MCoreT5Model": + vp_size = self.virtual_pipeline_model_parallel_size + if vp_size: + p_size = self.pipeline_model_parallel_size + assert ( + self.num_layers // p_size + ) % vp_size == 0, "Make sure the number of model chunks is the same across all pipeline stages." + + from megatron.core import parallel_state + from megatron.core.models.T5.t5_model import T5Model as MCoreT5Model + + encoder_config = copy.deepcopy(self) + encoder_config.num_layers = self.encoder_num_layers + if self.pipeline_model_parallel_size > 1: + assert self.encoder_pipeline_model_parallel_size > 0, "Need to know how to shard the encoder & decoder." 
+ encoder_config.pipeline_model_parallel_size = self.encoder_pipeline_model_parallel_size + + transformer_layer_spec = self.transformer_layer_spec + if not isinstance(transformer_layer_spec, ModuleSpec): + transformer_layer_spec = transformer_layer_spec(encoder_config=encoder_config, decoder_config=self) + + model = MCoreT5Model( + config=self, + encoder_config=encoder_config, + transformer_encoder_layer_spec=transformer_layer_spec[0], + transformer_decoder_layer_spec=transformer_layer_spec[1], + vocab_size=get_vocab_size(self, tokenizer.vocab_size, self.make_vocab_size_divisible_by), + max_sequence_length=self.max_position_embeddings, + fp16_lm_cross_entropy=self.fp16_lm_cross_entropy, + parallel_output=self.parallel_output, + share_embeddings_and_output_weights=self.share_embeddings_and_output_weights, + position_embedding_type=self.position_embedding_type, + rotary_percent=self.rotary_percent, + seq_len_interpolation_factor=self.seq_len_interpolation_factor, + pre_process=parallel_state.is_pipeline_first_stage(), + post_process=parallel_state.is_pipeline_last_stage(), + ) + + return model + + +class T5Model(L.LightningModule, io.IOMixin, io.ConnectorMixin, fn.FNMixin): + def __init__( + self, + config: T5Config, + # TODO: Add transformer_layer_spec when we update mcore + optim: Optional[OptimizerModule] = None, + tokenizer: Optional["TokenizerSpec"] = None, + model_transform: Optional[Callable[[nn.Module], nn.Module]] = None, + ): + super().__init__() + self.config = config + self.tokenizer = tokenizer + self.optim = optim or MegatronOptimizerModule(config=OptimizerConfig(lr=1e-4, use_distributed_optimizer=True)) + self.optim.connect(self) # This will bind the `configure_optimizers` method + self.model_transform = model_transform + self._training_loss_reduction = None + self._validation_loss_reduction = None + + def configure_model(self) -> None: + if not hasattr(self, "module"): + self.module = self.config.configure_model(self.tokenizer) + + def forward( + self, + encoder_input_ids: torch.Tensor, + decoder_input_ids: torch.Tensor, + encoder_attn_mask: torch.Tensor, + decoder_attn_mask: torch.Tensor, + encoder_decoder_attn_mask: torch.Tensor, + lm_labels: Optional[torch.Tensor] = None, + inference_params=None, + ) -> torch.Tensor: + + output_tensor = self.module( + encoder_input_ids=encoder_input_ids, + decoder_input_ids=decoder_input_ids, + encoder_attn_mask=encoder_attn_mask, + decoder_attn_mask=decoder_attn_mask, + encoder_decoder_attn_mask=encoder_decoder_attn_mask, + lm_labels=lm_labels, + inference_params=inference_params, + ) + + return output_tensor + + def data_step(self, dataloader_iter) -> Dict[str, torch.Tensor]: + return self.config.data_step_fn(dataloader_iter) + + def forward_step(self, batch) -> torch.Tensor: + return self.config.forward_step_fn(self, batch) + + def training_step(self, batch, batch_idx=None) -> torch.Tensor: + # In mcore the loss-function is part of the forward-pass (when labels are provided) + return self.forward_step(batch) + + def validation_step(self, batch, batch_idx=None) -> torch.Tensor: + # In mcore the loss-function is part of the forward-pass (when labels are provided) + + return self.forward_step(batch) + + @property + def training_loss_reduction(self) -> MaskedTokenLossReduction: + if not self._training_loss_reduction: + self._training_loss_reduction = MaskedTokenLossReduction() + + return self._training_loss_reduction + + @property + def validation_loss_reduction(self) -> MaskedTokenLossReduction: + if not self._validation_loss_reduction: + 
self._validation_loss_reduction = MaskedTokenLossReduction(validation_step=True) + + return self._validation_loss_reduction + + +__all__ = [ + "T5Model", + "T5Config", + "t5_data_step", + "t5_forward_step", + "transformer_engine_layer_spec", + "local_layer_spec", +] diff --git a/nemo/collections/multimodal/data/__init__.py b/nemo/collections/multimodal/data/__init__.py index 01b98aecaecd..7e6ac24828f5 100644 --- a/nemo/collections/multimodal/data/__init__.py +++ b/nemo/collections/multimodal/data/__init__.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from nemo.utils.import_utils import safe_import_from -from nemo.collections.multimodal.data.energon import SimpleMultiModalDataModule - +SimpleMultiModalDataModule, _ = safe_import_from( + "nemo.collections.multimodal.data.energon", "SimpleMultiModalDataModule" +) __all__ = ["SimpleMultiModalDataModule"] diff --git a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py index d38de8eb10b9..2c3b30f2fc74 100644 --- a/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py +++ b/nemo/collections/multimodal/models/vision_language_foundation/clip/megatron_clip_models.py @@ -424,26 +424,24 @@ def __init__(self, *args, **kwargs): # TODO (yuya): need to handle post_process correctly in order to enable PP self.output_dim = kwargs.pop('output_dim') super().__init__(*args, **kwargs) - if self.post_process: - self.final_layernorm = TENorm( - config=self.config, - hidden_size=self.config.hidden_size, - eps=self.config.layernorm_epsilon, - ) - self.head = torch.nn.Linear( - self.config.hidden_size, - self.output_dim, - bias=False, - ) + self.final_layernorm = TENorm( + config=self.config, + hidden_size=self.config.hidden_size, + eps=self.config.layernorm_epsilon, + ) + self.head = torch.nn.Linear( + self.config.hidden_size, + self.output_dim, + bias=False, + ) def forward(self, x): x = super().forward( x, ) - if self.post_process: - x = self.final_layernorm(x) - x = x[:, 0] - x = self.head(x) + x = self.final_layernorm(x) + x = x[:, 0] + x = self.head(x) return x diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py index e05c61bf3d24..046e032093b1 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py @@ -13,15 +13,15 @@ # limitations under the License. 
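For reference on the lazy-import change to `nemo/collections/multimodal/data/__init__.py` above: a hedged sketch of the `safe_import_from` pattern, assuming (as the discarded second return value suggests) that it returns the requested symbol plus a success flag, with a placeholder object substituted when the optional energon dependency is missing:

```python
from nemo.utils.import_utils import safe_import_from

# Resolve the optional energon-backed data module without failing at import time.
SimpleMultiModalDataModule, HAVE_ENERGON = safe_import_from(
    "nemo.collections.multimodal.data.energon", "SimpleMultiModalDataModule"
)

if not HAVE_ENERGON:
    # The placeholder only errors out if it is actually used, so plain
    # `import nemo.collections.multimodal.data` keeps working.
    print("energon data module unavailable; multimodal data features disabled")
```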
try: - from megatron.core.extensions.transformer_engine import TENorm + from megatron.core.extensions.transformer_engine import TEDotProductAttention, TENorm from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules - from megatron.core.transformer.dot_product_attention import DotProductAttention from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.mlp import MLP, MLPSubmodules - from megatron.core.transformer.moe.moe_layer import MoELayer + from megatron.core.transformer.moe.moe_layer import MoELayer, MoESubmodules + from megatron.core.transformer.moe.shared_experts import SharedExpertMLP from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules @@ -31,26 +31,25 @@ from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults - TransformerLayer = TransformerLayerSubmodules = ApexGuardDefaults - MLP = MLPSubmodules = ModuleSpec = IdentityOp = ApexGuardDefaults - AttnMaskType = DotProductAttention = TENorm = ApexGuardDefaults - ColumnParallelLinear = RowParallelLinear = SelfAttention = SelfAttentionSubmodules = ApexGuardDefaults - + ModuleSpec = ApexGuardDefaults HAVE_MEGATRON_CORE = False IMPORT_ERROR = e # Use this spec for Model Optimizer PTQ and TensorRT-LLM export def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec: - """Mix the native spec with TENorm. + """Mix the native spec with TENorm and TEDotProductAttention. This is essentially the native local spec except for the layernorm implementation is using TENorm from Transformer-Engine. This TENorm supports both FusedLayerNorm and RMSNorm and prevents the apex dependency. + + TEDotProductAttention is used to support sliding window attention. """ if not HAVE_MEGATRON_CORE: - raise Exception(IMPORT_ERROR) + raise IMPORT_ERROR + mlp = _get_mlp_module_spec(num_experts=num_experts) return ModuleSpec( module=TransformerLayer, submodules=TransformerLayerSubmodules( @@ -60,7 +59,7 @@ def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec: params={"attn_mask_type": AttnMaskType.causal}, submodules=SelfAttentionSubmodules( linear_qkv=ColumnParallelLinear, - core_attention=DotProductAttention, + core_attention=TEDotProductAttention, linear_proj=RowParallelLinear, q_layernorm=IdentityOp, k_layernorm=IdentityOp, @@ -68,7 +67,7 @@ def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec: ), self_attn_bda=get_bias_dropout_add, pre_mlp_layernorm=TENorm, - mlp=_get_mlp_module_spec(num_experts=num_experts), + mlp=mlp, mlp_bda=get_bias_dropout_add, # Map TE-layernorm-fusion keys back sharded_state_dict_keys_map={ @@ -80,7 +79,7 @@ def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec: # Helper function to get module spec for MLP/MoE -def _get_mlp_module_spec(num_experts: int = None, moe_grouped_gemm: bool = False) -> ModuleSpec: +def _get_mlp_module_spec(num_experts: int = None) -> ModuleSpec: if num_experts is None: # Dense MLP w/ or w/o TE modules. return ModuleSpec( @@ -94,12 +93,18 @@ def _get_mlp_module_spec(num_experts: int = None, moe_grouped_gemm: bool = False # Mixture of experts with modules in megatron core. 
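# (The routed experts are now wrapped in MoESubmodules together with an ungated SharedExpertMLP,
# both assembled from ColumnParallelLinear / RowParallelLinear, replacing the old bare
# MLPSubmodules / grouped-GEMM switch.)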
return ModuleSpec( module=MoELayer, - submodules=( - MLPSubmodules( + submodules=MoESubmodules( + experts=MLPSubmodules( linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear, - ) - if not moe_grouped_gemm - else None + ), + shared_experts=ModuleSpec( + module=SharedExpertMLP, + params={"gate": False}, + submodules=MLPSubmodules( + linear_fc1=ColumnParallelLinear, + linear_fc2=RowParallelLinear, + ), + ), ), ) diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 68c93415451f..2100e9c1ba8f 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -1068,16 +1068,19 @@ def dummy(): torch.distributed.barrier() # create nemo file from folder with all mp_ranks checkpoints - if ( - app_state.pipeline_model_parallel_rank == 0 - and app_state.tensor_model_parallel_rank == 0 - and app_state.data_parallel_rank == 0 - ): - with tempfile.TemporaryDirectory() as tmpdir: + if dist_ckpt: + should_move_data = is_global_rank_zero() + else: + should_move_data = ( + app_state.pipeline_model_parallel_rank == 0 + and app_state.tensor_model_parallel_rank == 0 + and app_state.data_parallel_rank == 0 + ) + if should_move_data: + with tempfile.TemporaryDirectory() as tmpdir: if dist_ckpt: shutil.move(str(dist_ckpt_dir), tmpdir) - elif app_state.pipeline_model_parallel_size == 1: # move weights to the tmpdir for tp_rank in range(app_state.tensor_model_parallel_size): @@ -1123,6 +1126,9 @@ def dummy(): for file in os.listdir(tmpdir): shutil.move(os.path.join(tmpdir, file), folder_path) + if torch.distributed.is_initialized(): + torch.distributed.barrier() + else: return super().save_to(model, save_path) diff --git a/nemo/collections/tts/data/dataset.py b/nemo/collections/tts/data/dataset.py index 348862ceddec..83d2b969ea91 100644 --- a/nemo/collections/tts/data/dataset.py +++ b/nemo/collections/tts/data/dataset.py @@ -504,7 +504,7 @@ def add_reference_audio(self, **kwargs): raise NotImplementedError(f"Reference audio type \"{reference_audio_type}\" is not supported.") def get_spec(self, audio): - with torch.cuda.amp.autocast(enabled=False): + with torch.amp.autocast(audio.device.type, enabled=False): spec = self.stft(audio) if spec.dtype in [torch.cfloat, torch.cdouble]: spec = torch.view_as_real(spec) @@ -512,7 +512,7 @@ def get_spec(self, audio): return spec def get_log_mel(self, audio): - with torch.cuda.amp.autocast(enabled=False): + with torch.amp.autocast(audio.device.type, enabled=False): spec = self.get_spec(audio) mel = torch.matmul(self.fb.to(spec.dtype), spec) log_mel = torch.log(torch.clamp(mel, min=torch.finfo(mel.dtype).tiny)) @@ -652,7 +652,7 @@ def __getitem__(self, index): sr=self.sample_rate, fill_na=0.0, ) - for (i, voiced_name, voiced_filepath) in non_exist_voiced_index: + for i, voiced_name, voiced_filepath in non_exist_voiced_index: my_var.__setitem__(voiced_name, torch.from_numpy(voiced_tuple[i]).float()) torch.save(my_var.get(voiced_name), voiced_filepath) @@ -859,9 +859,9 @@ def general_collate_fn(self, batch): durations_list.append(general_padding(durations, len(durations), max_durations_len)) if AlignPriorMatrix in self.sup_data_types_set: - align_prior_matrices[ - i, : align_prior_matrix.shape[0], : align_prior_matrix.shape[1] - ] = align_prior_matrix + align_prior_matrices[i, : align_prior_matrix.shape[0], : align_prior_matrix.shape[1]] = ( + align_prior_matrix + ) if Pitch in self.sup_data_types_set: pitches.append(general_padding(pitch, pitch_length.item(), 
max_pitches_len)) @@ -901,9 +901,9 @@ def general_collate_fn(self, batch): "p_voiced": torch.stack(p_voiceds) if P_voiced in self.sup_data_types_set else None, "audio_shifted": torch.stack(audios_shifted) if audio_shifted is not None else None, "reference_audio": torch.stack(reference_audios) if ReferenceAudio in self.sup_data_types_set else None, - "reference_audio_lens": torch.stack(reference_audio_lengths) - if ReferenceAudio in self.sup_data_types_set - else None, + "reference_audio_lens": ( + torch.stack(reference_audio_lengths) if ReferenceAudio in self.sup_data_types_set else None + ), } return data_dict @@ -1162,7 +1162,8 @@ def __len__(self): class PairedRealFakeSpectrogramsDataset(Dataset): def __init__( - self, manifest_filepath: Union[str, Path], + self, + manifest_filepath: Union[str, Path], ): manifest_filepath = Path(manifest_filepath) with Path(manifest_filepath).open() as f: @@ -1215,7 +1216,6 @@ def __init__( speaker_stats_pitch_fp: Optional[Union[str, Path]] = None, speaker_conditioning_type: Optional[str] = "per_sample", # per_sample, mean, interpolate, ): - """Dataset used for training FastPitchModel_SSL model. Requires supplementary data created using scripts/ssl_tts/make_supdata.py Args: @@ -1226,7 +1226,7 @@ def __init__( "speaker" : "duration": (Optional) sample_rate (int): The sample rate of the audio. Or the sample rate that we will resample all files to. - ssl_content_emb_type (str): One of ["probs", "embedding", "log_probs", "embedding_and_probs"]. + ssl_content_emb_type (str): One of ["probs", "embedding", "log_probs", "embedding_and_probs"]. Indicated which output to use as content embedding. max_duration (Optional[float]): Max duration of audio clips in seconds. All samples exceeding this will be pruned prior to training. Note: Requires "duration" to be set in the manifest file. It does not load @@ -1239,18 +1239,18 @@ def __init__( trim (bool): Whether to apply `librosa.effects.trim` to trim leading and trailing silence from an audio signal. Defaults to False. pitch_conditioning (bool): Whether to load pitch contour or not - pitch_mean (Optional[float]): If using global normalization, normalize using these statistics. + pitch_mean (Optional[float]): If using global normalization, normalize using these statistics. Also used if speaker stats are not available for the given speaker - pitch_std (Optional[float]): If using global normalization, normalize using these statistics. + pitch_std (Optional[float]): If using global normalization, normalize using these statistics. Also used if speaker stats are not available for the given speaker pitch_normalization (str): Can be one of ['speaker_wise', 'global', 'none']. Indicates the kind of pitch normalization. - sup_data_dir (Optional[Union[str, Path]]): Data directory containing pre-computed embeddings/statistics. If set as - speaker_stats_pitch_fp (Optional[Union[str, Path]]): Path to the json containing speaker pitch stats. - If set as None, tries to lookup for a default filename (speaker_pitch_stats.json) in sup_data_dir. + sup_data_dir (Optional[Union[str, Path]]): Data directory containing pre-computed embeddings/statistics. If set as + speaker_stats_pitch_fp (Optional[Union[str, Path]]): Path to the json containing speaker pitch stats. + If set as None, tries to lookup for a default filename (speaker_pitch_stats.json) in sup_data_dir. Needed if we use pitch_normalization is "speaker_wise" speaker_conditioning_type (Optional[str]): Can be one of ["per_sample", "mean", "interpolate"]. 
Defaults to "per_sample" per_sample: Speaker embedding computed from the same utterance - mean: Speaker embedding for all utterances of a given speaker is the same and equal to the mean speaker embedding. + mean: Speaker embedding for all utterances of a given speaker is the same and equal to the mean speaker embedding. interpolate: Interpolate b/w per_sample and mean speaker embedding. """ assert ssl_content_emb_type in ["probs", "embedding", "log_probs", "embedding_and_probs"] @@ -1328,7 +1328,10 @@ def __init__( def _get_wav_from_filepath(self, audio_filepath): features = AudioSegment.segment_from_file( - audio_filepath, target_sr=self.sample_rate, n_segments=-1, trim=self.trim, + audio_filepath, + target_sr=self.sample_rate, + n_segments=-1, + trim=self.trim, ) audio_samples = features.samples @@ -1531,7 +1534,7 @@ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): Maintain similar input lengths in a batch. Length groups are specified by boundaries. Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}. - + It removes samples which are not included in the boundaries. Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded. """ diff --git a/nemo/collections/tts/models/aligner.py b/nemo/collections/tts/models/aligner.py index 9aeb5fbe23ca..72d023e9ee10 100644 --- a/nemo/collections/tts/models/aligner.py +++ b/nemo/collections/tts/models/aligner.py @@ -117,12 +117,14 @@ def _setup_tokenizer(self, cfg): if "phoneme_dict" in cfg.text_tokenizer.g2p: g2p_kwargs["phoneme_dict"] = self.register_artifact( - 'text_tokenizer.g2p.phoneme_dict', cfg.text_tokenizer.g2p.phoneme_dict, + 'text_tokenizer.g2p.phoneme_dict', + cfg.text_tokenizer.g2p.phoneme_dict, ) if "heteronyms" in cfg.text_tokenizer.g2p: g2p_kwargs["heteronyms"] = self.register_artifact( - 'text_tokenizer.g2p.heteronyms', cfg.text_tokenizer.g2p.heteronyms, + 'text_tokenizer.g2p.heteronyms', + cfg.text_tokenizer.g2p.heteronyms, ) text_tokenizer_kwargs["g2p"] = instantiate(cfg.text_tokenizer.g2p, **g2p_kwargs) @@ -130,7 +132,7 @@ def _setup_tokenizer(self, cfg): self.tokenizer = instantiate(cfg.text_tokenizer, **text_tokenizer_kwargs) def forward(self, *, spec, spec_len, text, text_len, attn_prior=None): - with torch.cuda.amp.autocast(enabled=False): + with torch.amp.autocast(self.device.type, enabled=False): attn_soft, attn_logprob = self.alignment_encoder( queries=spec, keys=self.embed(text).transpose(1, 2), @@ -236,7 +238,9 @@ def _loader(self, cfg): text_tokenizer=self.tokenizer, ) return torch.utils.data.DataLoader( # noqa - dataset=dataset, collate_fn=dataset.collate_fn, **cfg.dataloader_params, + dataset=dataset, + collate_fn=dataset.collate_fn, + **cfg.dataloader_params, ) def setup_training_data(self, cfg): diff --git a/nemo/collections/tts/modules/common.py b/nemo/collections/tts/modules/common.py index 5f7d6153a7d1..cc7019439662 100644 --- a/nemo/collections/tts/modules/common.py +++ b/nemo/collections/tts/modules/common.py @@ -19,8 +19,6 @@ import numpy as np import torch from torch import Tensor, nn -from torch.cuda import amp -from torch.cuda.amp import autocast as autocast from torch.nn import functional as F from nemo.collections.tts.modules.submodules import ConvNorm, LinearNorm, MaskedInstanceNorm1d @@ -96,7 +94,7 @@ def lstm_nocast(self, context: Tensor, lens: Tensor) -> Tensor: dtype = context.dtype # autocast guard is only needed for Torchscript to run in Triton # 
(https://github.com/pytorch/pytorch/issues/89241) - with torch.cuda.amp.autocast(enabled=False): + with torch.amp.autocast(self.device.type, enabled=False): # Calculate sizes and prepare views to our zero buffer to pass as hx max_batch_size = context.shape[0] context = context.to(dtype=torch.float32) @@ -171,7 +169,10 @@ def forward(self, context: Tensor, lens: Tensor) -> Tensor: def get_radtts_encoder( - encoder_n_convolutions=3, encoder_embedding_dim=512, encoder_kernel_size=5, norm_fn=MaskedInstanceNorm1d, + encoder_n_convolutions=3, + encoder_embedding_dim=512, + encoder_kernel_size=5, + norm_fn=MaskedInstanceNorm1d, ): return ConvLSTMLinear( in_dim=encoder_embedding_dim, @@ -203,7 +204,7 @@ def __init__(self, c): self.upper_diag = nn.Parameter(torch.diag(upper)) self.upper = nn.Parameter(torch.triu(upper, 1)) - @amp.autocast(False) + @torch.amp.autocast(device_type='cuda', enabled=False) def forward(self, z, inverse=False): U = torch.triu(self.upper, 1) + torch.diag(self.upper_diag) L = torch.tril(self.lower, -1) + torch.diag(self.lower_diag) @@ -280,7 +281,7 @@ def __init__( out_channels = -1 self.use_partial_padding = use_partial_padding for i in range(n_layers): - dilation = 2 ** i if with_dilation else 1 + dilation = 2**i if with_dilation else 1 padding = int((kernel_size * dilation - dilation) / 2) out_channels = min(max_channels, in_channels * 2) self.layers.append( @@ -354,7 +355,7 @@ def __init__( self.end = end for i in range(n_layers): - dilation = 2 ** i + dilation = 2**i padding = int((kernel_size * dilation - dilation) / 2) in_layer = ConvNorm( n_channels, @@ -469,7 +470,7 @@ def forward(self, z, context, inverse=False): z_reshaped = z.permute(0, 2, 1).reshape(b_s * t_s, -1) affine_params = self.param_predictor(context) q_tilde = affine_params.permute(0, 2, 1).reshape(b_s * t_s, c_s, -1) - with amp.autocast(enabled=False): + with torch.amp.autocast(self.device.type, enabled=False): if self.use_quadratic: w = q_tilde[:, :, : self.n_bins // 2] v = q_tilde[:, :, self.n_bins // 2 :] @@ -554,7 +555,7 @@ def forward(self, z, context, inverse=False, seq_lens=None): z_1_reshaped = z_1.permute(0, 2, 1).reshape(b_s * t_s, -1) q_tilde = affine_params.permute(0, 2, 1).reshape(b_s * t_s, n_half, self.n_bins) - with autocast(enabled=False): + with torch.amp.autocast(self.device.type, enabled=False): if self.use_quadratic: w = q_tilde[:, :, : self.n_bins // 2] v = q_tilde[:, :, self.n_bins // 2 :] diff --git a/nemo/collections/vlm/neva/model/llava.py b/nemo/collections/vlm/neva/model/llava.py index dc27f28373fa..da894f183bbf 100644 --- a/nemo/collections/vlm/neva/model/llava.py +++ b/nemo/collections/vlm/neva/model/llava.py @@ -111,7 +111,7 @@ def convert_state(self, source, target): "language_model.model.layers.*.post_attention_layernorm.weight": "language_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight", "language_model.model.norm.weight": "language_model.decoder.final_layernorm.weight", "language_model.lm_head.weight": "language_model.output_layer.weight", - "vision_tower.vision_model.*": "vision_model.vision_model.*", + "vision_tower.vision_model.**": "vision_model.vision_model.**", } if "vision_projection.encoder.linear_fc1.weight" in target.module.state_dict().keys(): mapping.update( diff --git a/nemo/deploy/multimodal/query_multimodal.py b/nemo/deploy/multimodal/query_multimodal.py index 63e6a3e8c3a6..bf11062f0be1 100644 --- a/nemo/deploy/multimodal/query_multimodal.py +++ b/nemo/deploy/multimodal/query_multimodal.py @@ -105,6 +105,7 @@ def query( repetition_penalty=1.0, 
num_beams=1, init_timeout=60.0, + lora_uids=None, ): prompts = str_list2numpy([input_text]) @@ -137,6 +138,10 @@ def query( if num_beams is not None: inputs["num_beams"] = np.full(prompts.shape, num_beams, dtype=np.int_) + if lora_uids is not None: + lora_uids = np.char.encode(lora_uids, "utf-8") + inputs["lora_uids"] = np.full((prompts.shape[0], len(lora_uids)), lora_uids) + with ModelClient(self.url, self.model_name, init_timeout_s=init_timeout) as client: result_dict = client.infer_batch(**inputs) output_type = client.model_config.outputs[0].dtype diff --git a/nemo/export/multimodal/build.py b/nemo/export/multimodal/build.py index 53c598be47c6..ed300846c8e7 100644 --- a/nemo/export/multimodal/build.py +++ b/nemo/export/multimodal/build.py @@ -19,6 +19,7 @@ import tempfile from pathlib import Path from time import time +from typing import List import tensorrt as trt import torch @@ -47,11 +48,14 @@ def build_trtllm_engine( max_batch_size: int = 1, max_multimodal_len: int = 1024, dtype: str = "bfloat16", + use_lora_plugin: str = None, + lora_target_modules: List[str] = None, + max_lora_rank: int = 64, + lora_ckpt_list: List[str] = None, ): - trt_llm_exporter = TensorRTLLM(model_dir=model_dir, load_model=False) - visual_checkpoint_model = ['neva', 'lita', 'vila', 'vita'] + trt_llm_exporter = TensorRTLLM(model_dir=model_dir, lora_ckpt_list=lora_ckpt_list, load_model=False) trt_llm_exporter.export( - nemo_checkpoint_path=visual_checkpoint_path if model_type in visual_checkpoint_model else llm_checkpoint_path, + nemo_checkpoint_path=visual_checkpoint_path if llm_checkpoint_path is None else llm_checkpoint_path, model_type=llm_model_type, tensor_parallelism_size=tensor_parallelism_size, max_input_len=max_input_len, @@ -60,6 +64,9 @@ def build_trtllm_engine( max_prompt_embedding_table_size=max_multimodal_len, dtype=dtype, load_model=False, + use_lora_plugin=use_lora_plugin, + lora_target_modules=lora_target_modules, + max_lora_rank=max_lora_rank, ) @@ -211,9 +218,22 @@ def build_neva_engine( vision_max_batch_size: int = 1, ): device = torch.device("cuda") if torch.cuda.is_available() else "cpu" - # extract NeMo checkpoint - with tempfile.TemporaryDirectory() as temp: - temp_path = Path(temp) + + if os.path.isdir(visual_checkpoint_path): + # load untar checkpoint + config_path = os.path.join(visual_checkpoint_path, 'model_config.yaml') + with open(config_path, 'r') as f: + nemo_config = yaml.safe_load(f) + try: + weights_path = os.path.join(visual_checkpoint_path, 'model_weights.ckpt') + mp0_weights = torch.load(weights_path, map_location=device) + except FileNotFoundError: + weights_path = os.path.join(visual_checkpoint_path, 'mp_rank_00/model_weights.ckpt') + mp0_weights = torch.load(weights_path, map_location=device) + else: + # extract NeMo checkpoint + with tempfile.TemporaryDirectory() as temp: + temp_path = Path(temp) mp0_weights, nemo_config, _ = load_nemo_model(visual_checkpoint_path, temp_path) vision_config = nemo_config["mm_cfg"]["vision_encoder"] @@ -496,3 +516,37 @@ def build_visual_engine( build_video_neva_engine(model_dir, visual_checkpoint_path, vision_max_batch_size) else: raise RuntimeError(f"Invalid model type {model_type}") + + +def extract_lora_ckpt( + lora_ckpt: str, + output_dir: str, +): + if os.path.exists(os.path.join(lora_ckpt, "model_weights.ckpt")): + model_weight = torch.load(os.path.join(lora_ckpt, "model_weights.ckpt")) + elif os.path.exists(os.path.join(lora_ckpt, "mp_rank_00", "model_weights.ckpt")): + model_weight = torch.load(os.path.join(lora_ckpt, 
"mp_rank_00", "model_weights.ckpt")) + else: + raise RuntimeError(f"Imcompatible lora checkpoint format") + + model_config = os.path.join(lora_ckpt, "model_config.yaml") + + if not os.path.exists(model_config): + raise RuntimeError(f"Imcompatible lora checkpoint format") + + llm_lora_weight = {} + + for k, v in model_weight.items(): + if "mm_projector" not in k: + llm_lora_weight[k] = v + + llm_lora_path = os.path.join(output_dir, "llm_lora.nemo") + with tempfile.TemporaryDirectory() as tmp_dir: + llm_weight_path = os.path.join(tmp_dir, "model_weights.ckpt") + torch.save(llm_lora_weight, llm_weight_path) + + with tarfile.open(llm_lora_path, "w") as tar: + tar.add(llm_weight_path, arcname="model_weights.ckpt") + tar.add(model_config, arcname="model_config.yaml") + + return llm_lora_path diff --git a/nemo/export/multimodal/run.py b/nemo/export/multimodal/run.py index 2cde46ca41fa..f64e413efb82 100644 --- a/nemo/export/multimodal/run.py +++ b/nemo/export/multimodal/run.py @@ -171,7 +171,10 @@ def init_vision_preprocessor(self, visual_encoder_dir): def init_llm(self, llm_engine_dir): self.model = ModelRunner.from_dir( - llm_engine_dir, rank=tensorrt_llm.mpi_rank(), debug_mode=False, stream=self.stream + llm_engine_dir, + rank=tensorrt_llm.mpi_rank(), + debug_mode=False, + stream=self.stream, ) self.model_config = self.model.session._model_config self.runtime_mapping = self.model.session.mapping @@ -380,6 +383,7 @@ def generate( temperature, repetition_penalty, num_beams, + lora_uids=None, ): if not warmup: profiler.start("Generate") @@ -412,6 +416,7 @@ def generate( repetition_penalty=repetition_penalty, num_beams=num_beams, output_sequence_lengths=False, + lora_uids=lora_uids, return_dict=False, ) @@ -786,6 +791,7 @@ def run( temperature, repetition_penalty, num_beams, + lora_uids=None, run_profiling=False, check_accuracy=False, ): @@ -807,6 +813,7 @@ def run( temperature=temperature, repetition_penalty=repetition_penalty, num_beams=num_beams, + lora_uids=lora_uids, ) num_iters = self.profiling_iterations if run_profiling else 1 for _ in range(num_iters): @@ -824,6 +831,7 @@ def run( temperature=temperature, repetition_penalty=repetition_penalty, num_beams=num_beams, + lora_uids=lora_uids, ) if self.runtime_rank == 0: self.print_result(input_text, output_text, batch_size, num_beams, run_profiling, check_accuracy) @@ -1076,6 +1084,7 @@ def run( check_accuracy=False, input_signal=None, input_signal_length=None, + lora_uids=None, ): """ Args: diff --git a/nemo/export/tensorrt_mm_exporter.py b/nemo/export/tensorrt_mm_exporter.py index d4da0ac34b1c..4ff756571d8c 100644 --- a/nemo/export/tensorrt_mm_exporter.py +++ b/nemo/export/tensorrt_mm_exporter.py @@ -15,14 +15,22 @@ import logging import os import shutil +import tempfile from pathlib import Path +from typing import List import numpy as np import wrapt from nemo.deploy import ITritonDeployable -from nemo.export.multimodal.build import build_perception_engine, build_trtllm_engine, build_visual_engine +from nemo.export.multimodal.build import ( + build_perception_engine, + build_trtllm_engine, + build_visual_engine, + extract_lora_ckpt, +) from nemo.export.multimodal.run import MultimodalModelRunner, SpeechllmModelRunner +from nemo.export.tarutils import unpack_tarball use_deploy = True try: @@ -100,6 +108,10 @@ def export( dtype: str = "bfloat16", delete_existing_files: bool = True, load_model: bool = True, + use_lora_plugin: str = None, + lora_target_modules: List[str] = None, + lora_checkpoint_path: str = None, + max_lora_rank: int = 64, ): 
if Path(self.model_dir).exists(): if delete_existing_files and len(os.listdir(self.model_dir)) > 0: @@ -117,6 +129,20 @@ def export( else: Path(self.model_dir).mkdir(parents=True, exist_ok=True) + if lora_checkpoint_path is not None: + tmp_dir = tempfile.TemporaryDirectory() + if os.path.isdir(lora_checkpoint_path): + lora_dir = lora_checkpoint_path + else: + lora_dir = os.path.join(tmp_dir.name, "unpacked_lora") + unpack_tarball(lora_checkpoint_path, lora_dir) + + llm_lora_path = [extract_lora_ckpt(lora_dir, tmp_dir.name)] + else: + tmp_dir = None + llm_lora_path = None + lora_dir = None + llm_dir = os.path.join(self.model_dir, "llm_engine") build_trtllm_engine( model_dir=llm_dir, @@ -130,6 +156,10 @@ def export( max_batch_size=max_batch_size, max_multimodal_len=max_multimodal_len, dtype=dtype, + use_lora_plugin=use_lora_plugin, + lora_target_modules=lora_target_modules, + max_lora_rank=max_lora_rank, + lora_ckpt_list=llm_lora_path, ) if model_type == "salm": @@ -137,7 +167,12 @@ def export( build_perception_engine(perception_dir, visual_checkpoint_path, model_type, vision_max_batch_size) else: visual_dir = os.path.join(self.model_dir, "visual_engine") - build_visual_engine(visual_dir, visual_checkpoint_path, model_type, vision_max_batch_size) + build_visual_engine( + visual_dir, visual_checkpoint_path if lora_dir is None else lora_dir, model_type, vision_max_batch_size + ) + + if tmp_dir is not None: + tmp_dir.cleanup() if load_model: self._load() @@ -153,6 +188,7 @@ def forward( temperature: float = 1.0, repetition_penalty: float = 1.0, num_beams: int = 1, + lora_uids: List[str] = None, ): if self.runner is None: raise Exception( @@ -170,6 +206,7 @@ def forward( temperature, repetition_penalty, num_beams, + lora_uids, ) def get_input_media_tensors(self): @@ -195,6 +232,7 @@ def get_triton_input(self): Tensor(name="temperature", shape=(-1,), dtype=np.single, optional=True), Tensor(name="repetition_penalty", shape=(-1,), dtype=np.single, optional=True), Tensor(name="num_beams", shape=(-1,), dtype=np.int_, optional=True), + Tensor(name="lora_uids", shape=(-1,), dtype=bytes, optional=True), ] ) inputs = tuple(inputs) @@ -236,6 +274,9 @@ def triton_infer_fn(self, **inputs: np.ndarray): infer_input["repetition_penalty"] = inputs.pop("repetition_penalty")[0][0] if "num_beams" in inputs: infer_input["num_beams"] = inputs.pop("num_beams")[0][0] + if "lora_uids" in inputs: + lora_uids = np.char.decode(inputs.pop("lora_uids").astype("bytes"), encoding="utf-8") + infer_input["lora_uids"] = lora_uids[0].tolist() output_texts = self.runner.run(**infer_input) output = cast_output(output_texts, np.bytes_) @@ -247,6 +288,8 @@ def triton_infer_fn(self, **inputs: np.ndarray): def _load(self): llm_dir = os.path.join(self.model_dir, "llm_engine") + if not os.path.exists(llm_dir): + return if self.modality == "vision": visual_dir = os.path.join(self.model_dir, "visual_engine") self.runner = MultimodalModelRunner(visual_dir, llm_dir, self.modality) diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py index e37c3ba1c845..4720efc51e53 100755 --- a/nemo/export/trt_llm/tensorrt_llm_build.py +++ b/nemo/export/trt_llm/tensorrt_llm_build.py @@ -105,13 +105,14 @@ def build_and_save_engine( if use_lora_plugin is not None: # build_config.plugin_config.set_lora_plugin(use_lora_plugin) - # build_config.plugin_config._lora_plugin = use_lora_plugin + build_config.plugin_config._lora_plugin = use_lora_plugin lora_config = LoraConfig( lora_dir=lora_ckpt_list, 
lora_ckpt_source='nemo', max_lora_rank=max_lora_rank, - lora_target_modules=lora_target_modules, ) + if lora_target_modules is not None: + lora_config.lora_target_modules = lora_target_modules build_config.lora_config = lora_config model = model_cls.from_config(model_config) diff --git a/nemo/lightning/io/state.py b/nemo/lightning/io/state.py index 2a4588617241..fc2281b9b063 100644 --- a/nemo/lightning/io/state.py +++ b/nemo/lightning/io/state.py @@ -326,8 +326,28 @@ def call_transform(self, ctx: TransformCTX, *args, **kwargs): def _match_keys(keys: List[str], pattern: str) -> np.ndarray: - regex_pattern = re.compile("^" + pattern.replace("*", r"([^.]+)") + "$") - wildcard_matches = [[] for _ in range(pattern.count("*"))] + escaped_pattern = '' + i = 0 + wildcard_positions = [] + while i < len(pattern): + if pattern[i : i + 2] == '**': + escaped_pattern += r'(.+)' # Match any characters including dots + wildcard_positions.append('**') + i += 2 + elif pattern[i] == '*': + escaped_pattern += r'([^.]+)' # Match any characters except dots + wildcard_positions.append('*') + i += 1 + else: + if pattern[i] == '.': + escaped_pattern += r'\.' # Escape the dot + else: + escaped_pattern += pattern[i] + i += 1 + + regex_pattern = re.compile("^" + escaped_pattern + "$") + num_wildcards = len(wildcard_positions) + wildcard_matches = [[] for _ in range(num_wildcards)] for key in filter(lambda x: x is not None, keys): match = regex_pattern.match(key) diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index 60f090d6318f..096c7728d4a1 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -18,6 +18,7 @@ import inspect import queue from collections import defaultdict +from dataclasses import dataclass from typing import ( Any, Callable, @@ -36,6 +37,7 @@ runtime_checkable, ) +import pytorch_lightning as pl import torch import torch.distributed from megatron.core import parallel_state @@ -48,6 +50,7 @@ DataT = TypeVar("DataT", Tensor, Dict[str, Tensor], Sequence[Tensor]) ModelT = TypeVar("ModelT", bound=nn.Module) +T = TypeVar('T') @runtime_checkable @@ -207,7 +210,7 @@ def forward( data: Union[DataT, Iterator[DataT], List[Iterator[DataT]]], forward_only: bool = True, data_step: Optional[Callable[[Iterator[DataT]], DataT]] = None, - forward_step: Optional[Callable[[nn.Module, DataT], Tensor]] = None, + forward_step: Optional[Callable[[ModelT, DataT], Tensor]] = None, loss_reduction: Optional["MegatronLossReduction[DataT, Any]"] = None, seq_length: Optional[int] = None, micro_batch_size: Optional[int] = None, @@ -238,97 +241,61 @@ def forward( """ _forward_step = forward_step or self.forward_step _loss_reduction = loss_reduction or self.loss_reduction - _micro_batch_size: int = micro_batch_size or self.infer_micro_batch_size(data) - _seq_length: int = seq_length or self.infer_seq_length(data) - _num_microbatches: int = num_microbatches or self.infer_num_microbatches(data) - - pipeline = self.pipeline - - # FIXME: cleanup the following code block which is here for backwards compatibility with nemo1. The "batch" - # sampler is a nemo1 sampler. It requires some custom code here to use (if use_global_batch_sampler). - # by default we shouldn't use this "batch" sampler probably. 
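# NOTE: forward() no longer massages the nemo1 "batch" sampler itself, nor infers
# micro_batch_size / seq_length / num_microbatches from the data (those helpers are removed
# further down); these concerns now sit behind the MegatronStep object built via
# MegatronStep.infer(...) in the replacement code.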
- if getattr(self.trainer, "datamodule", None) is not None: - use_global_batch_sampler = self.trainer.datamodule.data_sampler.dataloader_type == 'batch' - elif getattr(self.trainer, "predict_dataloaders", None) is not None: - from nemo.collections.nlp.data.language_modeling.megatron.megatron_batch_samplers import ( # noqa: I001 - MegatronPretrainingBatchSampler, - ) - - # The batch_sampler gets injected into the dataloader by the data_sampler. When doing predict without a - # datamodule we can look inside the dataloader's batch_sampler to see if it is the nemo1 style sampler - # that we need to handle specially below. - use_global_batch_sampler = isinstance( - self.trainer.predict_dataloaders.batch_sampler, MegatronPretrainingBatchSampler - ) - else: - raise ValueError("Unsure how to check for nemo1 global_batch_sampler status. TODO maybe default to False?") - if use_global_batch_sampler: - from nemo.collections.nlp.modules.common.megatron.utils import get_iterator_k_split - - # The current way of using a batch sampler + split to micro iterator results in - # extraneous padding, and is only implemented to ensure bit-exactness with NeMo 1. - # This part in NeMo 1 was written when megatron fwd_bwd_function did not support unequal - # sequence lengths, but it does now. Hence this part should be revisited in the future. - batch = next(data) - if isinstance(batch, tuple) and len(batch) == 3: - batch = batch[0] - data = get_iterator_k_split(batch, _num_microbatches, True) - - data_iterator: List[Iterator[DataT]] = self.to_data_iterator_list(data) - context = self._build_context({**locals()}) + _forward_context = {} if wrap_forward_step: _data_step = data_step or self.data_step forward_step_func = self.wrapped_forward_step( - _forward_step, + forward_step=_forward_step, data_step=_data_step, loss_reduction=_loss_reduction, - context=context, + context=_forward_context, ) else: forward_step_func = _forward_step - self.callbacks.event("on_megatron_step_start", **context) - self.callbacks.event("on_megatron_microbatches_start", **context) - - microbatch_outputs = self.forward_backward_func( - forward_step_func=forward_step_func, - data_iterator=data_iterator, - model=pipeline, + step = MegatronStep.infer( + self, + data, + forward_step_func, forward_only=forward_only, - micro_batch_size=_micro_batch_size, - seq_length=_seq_length, - num_microbatches=_num_microbatches, + micro_batch_size=micro_batch_size, + num_microbatches=num_microbatches, + seq_length=seq_length, ) + _forward_context["step"] = step + step = self.callbacks.transform_event("on_megatron_step_start", step) - context["microbatch_outputs"] = microbatch_outputs - - self.callbacks.event("on_megatron_microbatches_end", **context) + self.callbacks.event("on_megatron_microbatches_start", step=step) + microbatch_outputs = step() + self.callbacks.event("on_megatron_microbatches_end", step=step, microbatch_outputs=microbatch_outputs) if microbatch_outputs: - self.callbacks.event("on_megatron_reduce_microbatches_start", **context) + self.callbacks.event( + "on_megatron_reduce_microbatches_start", step=step, microbatch_outputs=microbatch_outputs + ) if isinstance(_loss_reduction, _ModuleStepFunction): _loss_reduction = _loss_reduction(self[0]) - loss_mean = _loss_reduction.reduce(microbatch_outputs) - context["loss_mean"] = loss_mean - self.callbacks.event("on_megatron_reduce_microbatches_end", **context) + reduced = _loss_reduction.reduce(microbatch_outputs) + self.callbacks.event( + "on_megatron_reduce_microbatches_end", + step=step, + 
loss_reduction=_loss_reduction, + microbatch_outputs=microbatch_outputs, + reduced=reduced, + ) else: # we're not on the last pipeline stage so no losses - loss_mean = torch.tensor(0.0, device=torch.cuda.current_device()) + reduced = torch.tensor(0.0, device=torch.cuda.current_device()) - self.callbacks.event("on_megatron_log_step_end", **context) - self.callbacks.event("on_megatron_step_end", **context) + self.callbacks.event("on_megatron_step_end", step=step, microbatch_outputs=microbatch_outputs, reduced=reduced) - return loss_mean + return reduced def wrapped_forward_step( - self, - forward_step, - loss_reduction, - context, - data_step, + self, forward_step, loss_reduction, data_step, context ) -> Callable[[nn.Module, DataT], Tuple[torch.Tensor, "MegatronCallbackProtocol"]]: """The method wraps the forward step function and returns a callable. @@ -355,6 +322,7 @@ def wrapped_forward_step_func(dataloader_iter, model): _data_step = data_step batch = _data_step(dataloader_iter) + step = context["step"] if isinstance(loss_reduction, _ModuleStepFunction): forward_callback = loss_reduction(model) @@ -366,10 +334,12 @@ def wrapped_forward_step_func(dataloader_iter, model): else: _forward_step = forward_step - _context = {**context, "batch": batch} - _context["forward_callback"] = forward_callback - - self.callbacks.event("on_megatron_microbatch_start", **_context) + self.callbacks.event( + "on_megatron_microbatch_start", + step=step, + batch=batch, + forward_callback=forward_callback, + ) if self.precision_plugin and parallel_state.is_pipeline_first_stage(): batch = self.precision_plugin.convert_input(batch) @@ -388,106 +358,18 @@ def wrapped_forward_step_func(dataloader_iter, model): if self.precision_plugin and parallel_state.is_pipeline_last_stage(): output_tensor = self.precision_plugin.convert_output(output_tensor) + self.callbacks.event( + "on_megatron_microbatch_end", + step=step, + batch=batch, + output=output_tensor, + forward_callback=forward_callback, + ) + return output_tensor, forward_callback return wrapped_forward_step_func - def to_data_iterator_list( - self, data: Union[DataT, Iterator[DataT], List[Iterator[DataT]]] - ) -> List[Iterator[DataT]]: - """ - Converts the provided data into a list of iterators. - - This method is used to convert the input data into a list of iterators that can be used - for data parallelism in the Megatron model. The input data can be a single data item, - an iterator, or a list of iterators. - - Args: - data (Union[DataT, Iterator[DataT], List[Iterator[DataT]]]): The input data to be - converted into a list of iterators. This can be a single data item, an iterator, - or a list of iterators. - - Returns - ------- - List[Iterator[DataT]]: A list of iterators created from the input data. - """ - if isinstance(data, Iterator): - return _make_data_iterator_list(self.pipeline, data) - elif isinstance(data, list) and all(isinstance(item, Iterator) for item in data): - # If data is already a list of iterators, return it as is - return cast(List[Iterator[DataT]], data) - - # For a single data item or any other type, wrap it in an iterator and return as a list - return cast(List[Iterator[DataT]], [iter([data])]) - - def infer_micro_batch_size(self, data: Union[DataT, Iterator[DataT], List[Iterator[DataT]]]) -> int: - """ - Infers the micro batch size from the provided data. - - This method attempts to infer the micro batch size by checking for specific attributes - in the data object. If the data object has a `micro_batch_size` attribute, it is returned. 
- If the data object has a `data_config` attribute with a `micro_batch_size` attribute, - it is returned. Otherwise, the method attempts to infer the micro batch size from the - first dimension of the data tensor, if the data is a tensor. If the data is a dictionary, - the method is called recursively on the first value of the dictionary. If the data is a - list or tuple with at least one element, the method is called recursively on the first - element. If none of these conditions are met, a ValueError is raised. - - Args: - data (Union[DataT, Iterator[DataT], List[Iterator[DataT]]]): The data to infer the - micro batch size from. - - Returns - ------- - int: The inferred micro batch size. - - Raises - ------ - ValueError: If the micro batch size cannot be inferred from the data. - """ - if hasattr(data, "micro_batch_size"): - return data.micro_batch_size - if hasattr(data, "data_config"): - return data.data_config.micro_batch_size - - if isinstance(data, Tensor): - return data.size(0) - elif isinstance(data, dict): - return self.infer_micro_batch_size(next(iter(data.values()))) - elif isinstance(data, (list, tuple)) and len(data) > 0: - _tensor: Tensor = data[0] - return self.infer_micro_batch_size(_tensor) - - raise ValueError("Cannot infer `micro_batch_size` from data, please specify it manually") - - def infer_seq_length(self, data: Union[DataT, Iterator[DataT], List[Iterator[DataT]]]) -> int: - if hasattr(data, "seq_length"): - return data.seq_length - if hasattr(data, "data_config"): - return data.data_config.seq_length - - if isinstance(data, Tensor): - # TODO: Check if at least 2 dims - return data.size(1) - elif isinstance(data, dict): - return self.infer_seq_length(next(iter(data.values()))) - elif isinstance(data, (list, tuple)) and len(data) > 0: - _tensor: Tensor = data[0] - return self.infer_seq_length(_tensor) - - raise ValueError("Cannot infer `seq_length` from data, please specify it manually") - - def infer_num_microbatches(self, data: Union[DataT, Iterator[DataT], List[Iterator[DataT]]]) -> int: - if hasattr(data, "num_microbatches"): - return data.num_microbatches - if hasattr(data, "data_config"): - return data.data_config.num_microbatches - - if isinstance(data, (dict, tuple, list, Tensor)): - return 1 - - raise ValueError("Cannot infer `num_microbatches` from data, please specify it manually") - def init_model_parallel(self): from megatron.core import parallel_state from megatron.core.tensor_parallel.layers import set_defaults_if_not_set_tensor_model_parallel_attributes @@ -564,27 +446,6 @@ def init_ddp(self): module.config.no_sync_func = no_sync_func module.config.grad_sync_func = grad_sync_func - def _build_context(self, context: Dict[str, Any]) -> Dict[str, Any]: - if "self" in context: - del context["self"] - context["pl_module"] = self - if hasattr(self, "trainer"): - context["trainer"] = self.trainer - - for val in [ - "data_step", - "forward_step", - "loss_reduction", - "micro_batch_size", - "seq_length", - "num_microbatches", - ]: - if "_" + val in context: - context[val] = context["_" + val] - del context["_" + val] - - return context - def _setup_module(self, function, **kwargs) -> None: if hasattr(function, "setup"): setup_args = inspect.getfullargspec(function.setup).args @@ -646,12 +507,6 @@ def pipeline(self) -> Union[ModelT, List[ModelT]]: def module(self) -> ModelT: return self[0] - @property - def forward_backward_func(self) -> "MegatronStepProtocol": - from megatron.core.pipeline_parallel.schedules import get_forward_backward_func - - return 
get_forward_backward_func() - @override def __getattr__(self, item: Any) -> Any: try: @@ -860,6 +715,39 @@ def event(self, name: str, *args, **kwargs) -> None: filtered_kwargs = {k: v for k, v in kwargs.items() if k in sig.parameters} callback_method(*filtered_args, **filtered_kwargs) + def transform_event(self, name: str, obj: T, **kwargs) -> T: + """ + Triggers an event that allows callbacks to transform and return an object. + + This method applies a series of potential transformations to the input object + by calling registered callbacks. Each callback has the opportunity to modify + and return a new version of the object. + + Parameters + ---------- + name : str + The name of the event to trigger. + obj : T + The object to be potentially transformed by callbacks. + **kwargs : Any + Additional keyword arguments to pass to the callbacks. + + Returns + ------- + T + The potentially transformed object. + """ + for callback in self.callbacks.get(name, []): + callback_method = getattr(callback, name, None) + if callable(callback_method): + result = callback_method(obj, **kwargs) + + # Update obj if the callback returned a value of the same type + if result is not None and isinstance(result, type(obj)): + obj = result + + return obj + def __add__(self, other) -> "CallbackConnector": """ Adds another CallbackConnector's callbacks to this one. @@ -945,22 +833,445 @@ def __contains__(self, callback_object) -> bool: return False +@dataclass +class MegatronStep(Generic[ModelT, DataT]): + """ + Represents a single step in the Megatron model's training or inference process. + + This class encapsulates all the necessary information and logic for executing + a single step (forward pass, and optionally backward pass) in the Megatron model. + It handles data preparation, model execution, and provides utilities for inferring + batch sizes and sequence lengths. + + Attributes: + pipeline (MegatronParallel[ModelT]): The Megatron parallel model pipeline. + data (Union[DataT, Iterator[DataT], List[Iterator[DataT]]]): Input data for the step. + forward_step_func (Callable): Function to perform the forward step. + forward_only (bool): If True, only perform forward pass (no backward pass). + micro_batch_size (Optional[int]): Size of each micro-batch. + seq_length (Optional[int]): Sequence length for the current step. + num_microbatches (Optional[int]): Number of micro-batches in this step. + + Type Parameters: + ModelT: The type of the model being used. + DataT: The type of the input data. + """ + + pipeline: MegatronParallel[ModelT] + data: Union[DataT, Iterator[DataT], List[Iterator[DataT]]] + forward_step_func: Callable + forward_only: bool + micro_batch_size: Optional[int] = None + seq_length: Optional[int] = None + num_microbatches: Optional[int] = None + + @classmethod + def infer( + cls, + pipeline: MegatronParallel[ModelT], + data: DataT, + forward_step_func: Callable, + forward_only: bool, + micro_batch_size: Optional[int] = None, + seq_length: Optional[int] = None, + num_microbatches: Optional[int] = None, + ) -> "MegatronStep[ModelT, DataT]": + """ + Creates a MegatronStep instance, inferring missing parameters if possible. + + This method attempts to infer the micro_batch_size, seq_length, and num_microbatches + from the provided data if they are not explicitly specified. + + Args: + pipeline (MegatronParallel[ModelT]): The Megatron parallel model pipeline. + data (DataT): Input data for the step. + forward_step_func (Callable): Function to perform the forward step. 
+ forward_only (bool): If True, only perform forward pass (no backward pass). + micro_batch_size (Optional[int]): Size of each micro-batch. + seq_length (Optional[int]): Sequence length for the current step. + num_microbatches (Optional[int]): Number of micro-batches in this step. + + Returns: + MegatronStep[ModelT, DataT]: An instance of MegatronStep with inferred parameters. + """ + return cls( + pipeline=pipeline, + data=data, + forward_step_func=forward_step_func, + forward_only=forward_only, + micro_batch_size=micro_batch_size or cls.infer_micro_batch_size(data), + seq_length=seq_length or cls.infer_seq_length(data), + num_microbatches=num_microbatches or cls.infer_num_microbatches(data), + ) + + def __call__(self) -> List[Any]: + """ + Executes the Megatron step. + + This method performs the forward (and optionally backward) pass using the + configured forward_backward_func. It ensures all necessary parameters are set + before execution. + + Returns: + List[Any]: The output of the forward_backward_func, typically containing + loss values and other relevant information. + + Raises: + ValueError: If any of num_microbatches, seq_length, or micro_batch_size is not set. + """ + if self.num_microbatches is None: + raise ValueError("num_microbatches is not set") + + if self.seq_length is None: + raise ValueError("seq_length is not set") + + if self.micro_batch_size is None: + raise ValueError("micro_batch_size is not set") + + return self.forward_backward_func( + forward_step_func=self.forward_step_func, + data_iterator=self.data_iterator, + model=self.model, + num_microbatches=self.num_microbatches, + seq_length=self.seq_length, + micro_batch_size=self.micro_batch_size, + forward_only=self.forward_only, + ) + + def to_data_iterator_list( + self, data: Union[DataT, Iterator[DataT], List[Iterator[DataT]]] + ) -> List[Iterator[DataT]]: + """ + Converts the provided data into a list of iterators. + + This method is used to convert the input data into a list of iterators that can be used + for data parallelism in the Megatron model. The input data can be a single data item, + an iterator, or a list of iterators. + + Args: + data (Union[DataT, Iterator[DataT], List[Iterator[DataT]]]): The input data to be + converted into a list of iterators. + + Returns: + List[Iterator[DataT]]: A list of iterators created from the input data. + """ + if isinstance(data, Iterator): + return _make_data_iterator_list(self.pipeline, data) + elif isinstance(data, list) and all(isinstance(item, Iterator) for item in data): + # If data is already a list of iterators, return it as is + return cast(List[Iterator[DataT]], data) + + # For a single data item or any other type, wrap it in an iterator and return as a list + return cast(List[Iterator[DataT]], [iter([data])]) + + @classmethod + def infer_micro_batch_size(cls, data: DataT) -> Optional[int]: + """ + Infers the micro-batch size from the input data. + + This method attempts to determine the micro-batch size by examining the first + dimension of the input data. It handles various data types including Tensors, + dictionaries, lists, and tuples. + + Args: + data (DataT): The input data from which to infer the micro-batch size. + + Returns: + Optional[int]: The inferred micro-batch size, or None if it cannot be determined. 
+ """ + if isinstance(data, Tensor): + return data.size(0) + elif isinstance(data, dict): + return cls.infer_micro_batch_size(next(iter(data.values()))) + elif isinstance(data, (list, tuple)) and len(data) > 0: + _tensor: Tensor = data[0] + return cls.infer_micro_batch_size(_tensor) + + return None + + @classmethod + def infer_seq_length(cls, data: DataT) -> Optional[int]: + """ + Infers the sequence length from the input data. + + This method attempts to determine the sequence length by examining the second + dimension of the input data. It handles various data types including Tensors, + dictionaries, lists, and tuples. + + Args: + data (DataT): The input data from which to infer the sequence length. + + Returns: + Optional[int]: The inferred sequence length, or None if it cannot be determined. + """ + if isinstance(data, Tensor): + # TODO: Check if at least 2 dims + return data.size(1) + elif isinstance(data, dict): + return cls.infer_seq_length(next(iter(data.values()))) + elif isinstance(data, (list, tuple)) and len(data) > 0: + _tensor: Tensor = data[0] + return cls.infer_seq_length(_tensor) + + return None + + @classmethod + def infer_num_microbatches(cls, data: DataT) -> Optional[int]: + """ + Infers the number of micro-batches from the input data. + + Currently, this method assumes a single micro-batch for common data types. + It may need to be extended for more complex data structures or use cases. + + Args: + data (DataT): The input data from which to infer the number of micro-batches. + + Returns: + Optional[int]: The inferred number of micro-batches, or None if it cannot be determined. + """ + if isinstance(data, (dict, tuple, list, Tensor)): + return 1 + + return None + + @property + def model(self) -> Union[ModelT, List[ModelT]]: + """ + Retrieves the model or list of models from the pipeline. + + Returns: + Union[ModelT, List[ModelT]]: The model or list of models in the pipeline. + """ + return self.pipeline.pipeline + + @property + def pl_module(self) -> pl.LightningModule: + """ + Retrieves the PyTorch Lightning module from the pipeline. + + Returns: + pl.LightningModule: The PyTorch Lightning module. + """ + return self.pipeline.module + + @property + def trainer(self) -> pl.Trainer: + """ + Retrieves the PyTorch Lightning trainer from the pipeline. + + Returns: + pl.Trainer: The PyTorch Lightning trainer. + """ + return self.pipeline.trainer + + @functools.cached_property + def forward_backward_func(self) -> "MegatronStepProtocol": + """ + Retrieves the forward-backward function for the Megatron model. + + This property uses Megatron's scheduling to get the appropriate + forward-backward function based on the current configuration. + + Returns: + MegatronStepProtocol: The function to perform forward and backward passes. + """ + from megatron.core.pipeline_parallel.schedules import get_forward_backward_func + + return get_forward_backward_func() + + @functools.cached_property + def data_iterator(self) -> List[Iterator[DataT]]: + """ + Cached property that converts the provided data into a list of iterators. + + This property ensures that the data is converted to the required format + only once and then cached for subsequent uses. + + Returns: + List[Iterator[DataT]]: A list of iterators created from the input data. 
+ """ + if self.has_global_batch_sampler: + batch = next(self.data) + if isinstance(batch, tuple) and len(batch) == 3: + batch = batch[0] + from nemo.collections.nlp.modules.common.megatron.utils import get_iterator_k_split + + data = get_iterator_k_split(batch, self.num_microbatches, True) + else: + data = self.data + return self.to_data_iterator_list(data) + + @functools.cached_property + def has_global_batch_sampler(self) -> bool: + # FIXME: cleanup the following code is here for backwards compatibility with nemo1. + # The "batch" sampler is a nemo1 sampler. It requires some custom code here to use + # (if use_global_batch_sampler), by default we shouldn't use this "batch" sampler probably. + if getattr(self.trainer, "datamodule", None) is not None: + use_global_batch_sampler = self.trainer.datamodule.data_sampler.dataloader_type == 'batch' + elif getattr(self.trainer, "predict_dataloaders", None) is not None: + from nemo.collections.nlp.data.language_modeling.megatron.megatron_batch_samplers import ( # noqa: I001 + MegatronPretrainingBatchSampler, + ) + + # The batch_sampler gets injected into the dataloader by the data_sampler. When doing + # predict without a datamodule we can look inside the dataloader's batch_sampler to see + # if it is the nemo1 style sampler that we need to handle specially below. + use_global_batch_sampler = isinstance( + self.trainer.predict_dataloaders.batch_sampler, MegatronPretrainingBatchSampler + ) + else: + use_global_batch_sampler = False + return use_global_batch_sampler + + class CallbackMethods: - def on_megatron_step_start(self, *args, **kwargs) -> None: ... + """ + Defines callback methods for various stages of the Megatron model's execution. + + This class outlines the structure for callbacks that can be implemented to hook into + different phases of the Megatron model's training or inference process. Each method + represents a specific point in the execution where custom logic can be inserted. + """ + + def on_megatron_step_start(self, step: MegatronStep) -> MegatronStep: + """ + Called at the beginning of each Megatron step. - def on_megatron_microbatch_start(self, *args, **kwargs) -> None: ... + This method is invoked before any processing of the step begins. It allows for + any necessary setup or initialization for the step. - def on_megatron_microbatch_callback(self, *args, **kwargs) -> None: ... + Args: + step (MegatronStep): The MegatronStep object representing the current step. - def on_megatron_microbatch_end(self, *args, **kwargs) -> None: ... + Returns: + MegatronStep: The potentially modified MegatronStep object. + """ + ... - def on_megatron_reduce_microbatches_start(self, *args, **kwargs) -> None: ... + def on_megatron_microbatches_start(self, step: MegatronStep) -> None: + """ + Called before processing of microbatches begins. - def on_megatron_reduce_microbatches_end(self, *args, **kwargs) -> None: ... + This method is invoked just before the model starts processing the microbatches + within a step. It can be used for any preparations needed before microbatch processing. - def on_megatron_log_step_end(self, *args, **kwargs) -> None: ... + Args: + step (MegatronStep): The MegatronStep object representing the current step. + """ + ... + + def on_megatron_microbatch_start( + self, + step: MegatronStep, + batch: DataT, + forward_callback: "MegatronLossReduction", + ) -> None: + """ + Called at the start of processing each microbatch. + + This method is invoked before the forward pass of each microbatch. 
It provides + access to the current batch data and the loss reduction callback. + + Args: + step (MegatronStep): The MegatronStep object representing the current step. + batch (DataT): The current microbatch of data being processed. + forward_callback (MegatronLossReduction): The callback for loss reduction. + """ + ... - def on_megatron_step_end(self, *args, **kwargs) -> None: ... + def on_megatron_microbatch_end( + self, + step: MegatronStep, + batch: DataT, + forward_callback: "MegatronLossReduction", + output: Any, + ) -> None: + """ + Called at the end of processing each microbatch. + + This method is invoked after the forward pass of each microbatch. It provides + access to the processed batch, the loss reduction callback, and the output of the forward pass. + + Args: + step (MegatronStep): The MegatronStep object representing the current step. + batch (DataT): The microbatch of data that was processed. + forward_callback (MegatronLossReduction): The callback for loss reduction. + output (Any): The output from the forward pass for this microbatch. + """ + ... + + def on_megatron_microbatches_end(self, step: MegatronStep, microbatch_outputs: List[Any]) -> None: + """ + Called after all microbatches in a step have been processed. + + This method is invoked once all microbatches within a step have been processed. + It provides access to the outputs from all microbatches. + + Args: + step (MegatronStep): The MegatronStep object representing the current step. + microbatch_outputs (List[Any]): A list of outputs from all processed microbatches. + """ + ... + + def on_megatron_reduce_microbatches_start( + self, + step: MegatronStep, + microbatch_outputs: List[Any], + ) -> None: + """ + Called before the reduction of microbatch outputs begins. + + This method is invoked just before the model starts reducing (e.g., averaging) + the outputs from all microbatches. It can be used for any preparations needed + before the reduction process. + + Args: + step (MegatronStep): The MegatronStep object representing the current step. + microbatch_outputs (List[Any]): A list of outputs from all processed microbatches. + """ + ... + + def on_megatron_reduce_microbatches_end( + self, + step: MegatronStep, + microbatch_outputs: List[Any], + loss_reduction: "MegatronLossReduction", + reduced: Union[torch.Tensor, Dict[str, torch.Tensor]], + ) -> None: + """ + Called after the reduction of microbatch outputs is complete. + + This method is invoked after the model has finished reducing the outputs from + all microbatches. It provides access to the original microbatch outputs, + the loss reduction object, and the final reduced output. + + Args: + step (MegatronStep): The MegatronStep object representing the current step. + microbatch_outputs (List[Any]): A list of outputs from all processed microbatches. + loss_reduction (MegatronLossReduction): The object used for loss reduction. + reduced (Union[torch.Tensor, Dict[str, torch.Tensor]]): The final reduced output. + """ + ... + + def on_megatron_step_end( + self, + step: MegatronStep, + microbatch_outputs: List[Any], + reduced: Optional[Union[torch.Tensor, Dict[str, torch.Tensor]]] = None, + ) -> None: + """ + Called at the end of each Megatron step. + + This method is invoked after all processing for a step is complete. It provides + access to the outputs from all microbatches and the final reduced output (if available). + + Args: + step (MegatronStep): The MegatronStep object representing the current step. 
+ microbatch_outputs (List[Any]): A list of outputs from all processed microbatches. + reduced (Optional[Union[torch.Tensor, Dict[str, torch.Tensor]]]): The final reduced + output, if available. This may be None for certain configurations or pipeline stages. + """ + ... ReductionT = TypeVar("ReductionT") diff --git a/nemo/lightning/pytorch/callbacks/model_checkpoint.py b/nemo/lightning/pytorch/callbacks/model_checkpoint.py index 2df4ca56d1a0..6aee365a3f60 100644 --- a/nemo/lightning/pytorch/callbacks/model_checkpoint.py +++ b/nemo/lightning/pytorch/callbacks/model_checkpoint.py @@ -73,7 +73,7 @@ def __init__( train_time_interval: Optional[timedelta] = None, save_on_train_epoch_end: Optional[bool] = False, # Save after training, not after validation save_optim_on_train_end: Optional[bool] = False, - always_save_context: bool = False, + always_save_context: bool = True, save_context_on_train_end: bool = True, **kwargs, ): diff --git a/nemo/lightning/pytorch/callbacks/peft.py b/nemo/lightning/pytorch/callbacks/peft.py index a3542d9a2135..1e3cde0bbcde 100644 --- a/nemo/lightning/pytorch/callbacks/peft.py +++ b/nemo/lightning/pytorch/callbacks/peft.py @@ -107,6 +107,9 @@ def setup(self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str) def apply_transform(self, trainer): super().apply_transform(trainer) + self.trainable_params = set( + name for name, param in trainer.lightning_module.named_parameters() if param.requires_grad + ) adapter_sharded_state_dict = {} if self.wrapped_io.adapter_ckpt_path is not None: @@ -137,10 +140,6 @@ def apply_transform(self, trainer): if trainer.state.fn == TrainerFn.FITTING: trainer.strategy.load_optimizer_state_dict(adapter_state, selective_restore=True) - self.trainable_params = set( - name for name, param in trainer.lightning_module.named_parameters() if param.requires_grad - ) - def adapter_key_filter(self, key: str) -> bool: return key in self.trainable_params or ".adapter." in key or key.endswith(".adapters") diff --git a/nemo/lightning/pytorch/optim/lr_scheduler.py b/nemo/lightning/pytorch/optim/lr_scheduler.py index 9a0f276006a7..1b78f87e595d 100644 --- a/nemo/lightning/pytorch/optim/lr_scheduler.py +++ b/nemo/lightning/pytorch/optim/lr_scheduler.py @@ -264,6 +264,8 @@ class WarmupAnnealingScheduler(LRSchedulerModule): def __init__( self, + warmup_steps: int = 750, + warmup_ratio: Optional[float] = None, max_steps: int = 10, min_lr: float = 0.0, interval: str = "step", @@ -271,6 +273,8 @@ def __init__( monitor: str = "val_loss", ): super().__init__() + self.warmup_steps = warmup_steps + self.warmup_ratio = warmup_ratio self.max_steps = max_steps self.min_lr = min_lr self.interval = interval @@ -278,7 +282,13 @@ def __init__( self.monitor = monitor def scheduler(self, model, optimizer): - lr_scheduler = WarmupAnnealing(optimizer, max_steps=self.max_steps, min_lr=self.min_lr) + lr_scheduler = WarmupAnnealing( + optimizer, + warmup_steps=self.warmup_steps, + warmup_ratio=self.warmup_ratio, + max_steps=self.max_steps, + min_lr=self.min_lr, + ) return { "optimizer": optimizer, "lr_scheduler": { diff --git a/nemo/lightning/pytorch/plugins/data_sampler.py b/nemo/lightning/pytorch/plugins/data_sampler.py index 060ec7915ec0..4fadae8dc722 100644 --- a/nemo/lightning/pytorch/plugins/data_sampler.py +++ b/nemo/lightning/pytorch/plugins/data_sampler.py @@ -12,12 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
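
The `transform_event` hook introduced above lets a callback's `on_megatron_step_start` return a replacement `MegatronStep` instead of mutating shared context, which is exactly how the `MegatronDataSampler` hunk just below injects `seq_length`, `micro_batch_size`, and `num_microbatches`. A minimal sketch of a user-defined callback using the same mechanism; the class name and the pinned sequence length are hypothetical:

import dataclasses

class FixedSeqLenCallback:
    """Hypothetical callback: pin seq_length for every Megatron step."""

    def __init__(self, seq_length: int = 2048):
        self.seq_length = seq_length

    def on_megatron_step_start(self, step):
        # MegatronStep is a dataclass, so return a modified copy;
        # transform_event keeps the result only if it has the same type as the input.
        return dataclasses.replace(step, seq_length=self.seq_length)

Because `transform_event` only accepts a returned value of the same type as the input, a callback that returns `None` (or anything else) leaves the step untouched.
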
+import dataclasses import logging -from typing import Any, Dict, List, Literal, Optional +from typing import List, Literal, Optional import pytorch_lightning as pl from torch.utils.data import DataLoader +from nemo.lightning.megatron_parallel import MegatronStep + class DataSampler: def connect(self, trainer: pl.Trainer): @@ -91,16 +94,28 @@ def compute_consumed_samples(self, steps_since_resume=0) -> int: return int(consumed_samples) # Megatron callbacks - def on_megatron_step_start(self, trainer: pl.Trainer) -> None: + + def on_megatron_step_start(self, step: MegatronStep) -> MegatronStep: + return dataclasses.replace( + step, + seq_length=self.seq_len, + micro_batch_size=self.micro_batch_size, + num_microbatches=self.num_microbatches, + ) + + def on_megatron_microbatches_start(self, step: MegatronStep) -> None: # do validation and save the checkpoint when gbs is changed if ( self.rampup_batch_size is not None and self.prev_global_batch_size != self.current_global_batch_size and self.prev_global_batch_size ): - trainer.should_stop = True + step.trainer.should_stop = True + + def on_megatron_step_end(self, step: MegatronStep) -> None: + trainer = step.trainer + pl_module = step.pl_module - def on_megatron_step_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None: try: from megatron.core.num_microbatches_calculator import update_num_microbatches @@ -136,14 +151,6 @@ def on_megatron_step_end(self, trainer: pl.Trainer, pl_module: pl.LightningModul ) self.if_first_step = 1 - @property - def megatron_data_kwargs(self) -> Dict[str, Any]: - return { - "seq_length": self.seq_len, - "micro_batch_size": self.micro_batch_size, - "num_microbatches": self.num_microbatches, - } - @property def num_microbatches(self) -> int: try: diff --git a/nemo/lightning/pytorch/plugins/mixed_precision.py b/nemo/lightning/pytorch/plugins/mixed_precision.py index c48bbcf8c1b1..5c318b59e54a 100644 --- a/nemo/lightning/pytorch/plugins/mixed_precision.py +++ b/nemo/lightning/pytorch/plugins/mixed_precision.py @@ -220,12 +220,12 @@ def update_config_with_dtype_overrides(dtype_config, config): for field in fields(dtype_config): if not hasattr(config, field.name): continue - # If we overwrote a value, throw a warning. + # If we overwrote a value, log a debug message. 
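
Several hunks that follow (cast_utils.py, export_utils.py, and the ASR evaluation scripts) replace the deprecated, CUDA-only `torch.cuda.amp.autocast` with the device-agnostic `torch.amp.autocast`, whose `enabled` flag makes the hand-rolled `contextlib.nullcontext` fallbacks unnecessary. A small before/after sketch, assuming a recent PyTorch; the module and tensor shapes are illustrative:

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = torch.nn.Linear(16, 16).to(device)
x = torch.randn(2, 16, device=device)

# Old, CUDA-only spelling (deprecated):
#   with torch.cuda.amp.autocast(dtype=torch.bfloat16): ...
# New spelling: the device type is passed explicitly, and enabled=False turns the
# context into a no-op, so callers no longer need a nullcontext fallback.
use_amp = True
with torch.amp.autocast(device, dtype=torch.bfloat16, enabled=use_amp):
    y = model(x)
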
old_val = getattr(config, field.name) new_val = getattr(dtype_config, field.name) if old_val != new_val: setattr(config, field.name, new_val) - logging.warning(f"Overwrote {type(config).__name__}.{field.name} {old_val} -> {new_val}") + logging.debug(f"Overwrote {type(config).__name__}.{field.name} {old_val} -> {new_val}") return config diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py index 841bec6ab731..6c0d7c8f6b04 100644 --- a/nemo/lightning/pytorch/strategies/megatron_strategy.py +++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py @@ -411,7 +411,9 @@ def setup_megatron_parallel(self, trainer: pl.Trainer) -> None: self.setup_optimizers(trainer) self.model = self.megatron_parallel - self.model.callbacks.add(getattr(trainer, "callbacks")) + trainer_callbacks = getattr(trainer, "callbacks", None) + if trainer_callbacks: + self.model.callbacks.add(*trainer_callbacks) if self.data_sampler: self.model.callbacks.add(self.data_sampler) @@ -480,6 +482,17 @@ def training_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OUTP out = self.model(dataloader_iter, forward_only=False, *args, **kwargs) + if torch.is_tensor(out): + reduced_train_loss = out + else: + if not isinstance(out, dict): + raise ValueError(f"Expected dict or tensor for reduced_train_loss, got {type(out)}") + + if "loss" not in out: + raise ValueError(f"Expected 'loss' in output dict, got {out.keys()}") + + reduced_train_loss = out["loss"] + self.lightning_module.log( "global_step", self.trainer.global_step, @@ -511,8 +524,10 @@ def training_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OUTP if self.log_train_loss: # p2p now, broadcast later at ckpt. only with pp, some ranks will log 0.0 # WHICH IS OK because we broadcast later at checkpoint time - _strategy_lib._sync_from_last_pipeline_stage(out, broadcast=False) - self.lightning_module.log("reduced_train_loss", out, prog_bar=True, batch_size=1, sync_dist=False) + _strategy_lib._sync_from_last_pipeline_stage(reduced_train_loss, broadcast=False) + self.lightning_module.log( + "reduced_train_loss", reduced_train_loss, prog_bar=True, batch_size=1, sync_dist=False + ) return out @@ -601,7 +616,6 @@ def _update_step_kwargs(self, dataloader_iter, kwargs, step_name: str): kwargs["forward_step"] = self._get_forward_step(step_name) if "loss_reduction" not in kwargs: kwargs["loss_reduction"] = self._get_loss_reduction(step_name) - kwargs.update(self._data_config_kwargs(dataloader_iter)) return kwargs @@ -781,13 +795,6 @@ def _get_loss_reduction(self, step_type: str) -> Optional[_ModuleStepFunction]: return None - def _data_config_kwargs(self, dataloader_iter) -> Dict[str, Any]: - if not hasattr(dataloader_iter, "data_config") and self.data_sampler: - if hasattr(self.data_sampler, "megatron_data_kwargs"): - return self.data_sampler.megatron_data_kwargs - - return {} - @property def distributed_sampler_kwargs(self) -> Dict[str, Any]: from nemo.utils import AppState diff --git a/nemo/utils/cast_utils.py b/nemo/utils/cast_utils.py index a7960be4cc4d..72d6c5c496d9 100644 --- a/nemo/utils/cast_utils.py +++ b/nemo/utils/cast_utils.py @@ -24,7 +24,7 @@ def avoid_bfloat16_autocast_context(): """ if torch.is_autocast_enabled() and torch.get_autocast_gpu_dtype() == torch.bfloat16: - return torch.cuda.amp.autocast(dtype=torch.float32) + return torch.amp.autocast('cuda', dtype=torch.float32) else: return nullcontext() @@ -37,12 +37,12 @@ def avoid_float16_autocast_context(): if 
torch.is_autocast_enabled() and torch.get_autocast_gpu_dtype() == torch.float16: if torch.jit.is_scripting() or torch.jit.is_tracing(): - return torch.cuda.amp.autocast(dtype=torch.float32) + return torch.amp.autocast('cuda', dtype=torch.float32) if torch.cuda.is_bf16_supported(): - return torch.cuda.amp.autocast(dtype=torch.bfloat16) + return torch.amp.autocast('cuda', dtype=torch.bfloat16) else: - return torch.cuda.amp.autocast(dtype=torch.float32) + return torch.amp.autocast('cuda', dtype=torch.float32) else: return nullcontext() @@ -71,7 +71,7 @@ def __init__(self, mod): def forward(self, x): if torch.is_autocast_enabled() and x.dtype != torch.float32: - with torch.cuda.amp.autocast(enabled=False): + with torch.amp.autocast(x.device.type, enabled=False): ret = self.mod.forward(x.to(torch.float32)).to(x.dtype) else: ret = self.mod.forward(x) @@ -86,7 +86,7 @@ def __init__(self, mod): def forward(self, *args): if torch.is_autocast_enabled(): from_dtype = args[0].dtype - with torch.cuda.amp.autocast(enabled=False): + with torch.amp.autocast(self.device.type, enabled=False): ret = self.mod.forward(*cast_all(args, from_dtype=from_dtype, to_dtype=torch.float32)) return cast_all(ret, from_dtype=torch.float32, to_dtype=from_dtype) else: diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index ca60010c6fda..543c7e0781d2 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -116,7 +116,7 @@ class CallbackParams: auto_insert_metric_name: bool = True every_n_epochs: Optional[int] = 1 every_n_train_steps: Optional[int] = None - train_time_interval: Optional[str] = None + train_time_interval: Optional[Any] = None prefix: Optional[str] = None # If None, exp_manager will attempt to handle the filepath postfix: str = ".nemo" save_best_model: bool = False @@ -374,6 +374,8 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo - max_time (str): The maximum wall clock time *per run*. This is intended to be used on clusters where you want a checkpoint to be saved after this specified time and be able to resume from that checkpoint. Defaults to None. - seconds_to_sleep (float): seconds to sleep non rank 0 processes for. Used to give enough time for rank 0 to initialize + - train_time_interval (timedelta): pass an object of timedelta to save the model every timedelta. Defaults to None. + (use _target_ with hydra to achieve this) returns: log_dir (Path): The final logging directory where logging files are saved. 
Usually the concatenation of diff --git a/nemo/utils/export_utils.py b/nemo/utils/export_utils.py index 534598097bf4..8bc01f652188 100644 --- a/nemo/utils/export_utils.py +++ b/nemo/utils/export_utils.py @@ -149,7 +149,7 @@ def verify_torchscript(model, output, input_examples, check_tolerance=0.01): for input_example in input_examples: input_list, input_dict = parse_input_example(input_example) # We disable autocast here to make sure exported TS will run under Triton or other C++ env - with torch.cuda.amp.autocast(enabled=False): + with torch.amp.autocast('cuda', enabled=False): output_example = model.forward(*input_list, **input_dict) ts_model = torch.jit.load(output) all_good = all_good and run_ts_and_compare( diff --git a/scripts/asr_context_biasing/eval_greedy_decoding_with_context_biasing.py b/scripts/asr_context_biasing/eval_greedy_decoding_with_context_biasing.py index d9275fd26fe9..4c62a62e31b6 100644 --- a/scripts/asr_context_biasing/eval_greedy_decoding_with_context_biasing.py +++ b/scripts/asr_context_biasing/eval_greedy_decoding_with_context_biasing.py @@ -254,7 +254,9 @@ def decoding_step( probs_batch[prob_index].unsqueeze(0), device=packed_batch.device, dtype=packed_batch.dtype ) best_hyp_batch, beams_batch = asr_model.decoding.rnnt_decoder_predictions_tensor( - packed_batch, probs_lens, return_hypotheses=True, + packed_batch, + probs_lens, + return_hypotheses=True, ) beams_batch = [[x] for x in best_hyp_batch] @@ -356,17 +358,8 @@ def main(cfg: EvalContextBiasingConfig): durations.append(data['duration']) audio_file_paths.append(str(audio_file.absolute())) - if cfg.use_amp: - if torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'): - logging.info("AMP is enabled!\n") - autocast = torch.cuda.amp.autocast - else: - autocast = contextlib.nullcontext - else: - autocast = contextlib.nullcontext - # manual calculation of encoder_embeddings - with autocast(): + with torch.amp.autocast(asr_model.device.type, enabled=cfg.use_amp): with torch.no_grad(): asr_model.eval() asr_model.encoder.freeze() diff --git a/scripts/asr_language_modeling/neural_rescorer/eval_neural_rescorer.py b/scripts/asr_language_modeling/neural_rescorer/eval_neural_rescorer.py index d0b4b1a61204..f2fbebd1bf4a 100644 --- a/scripts/asr_language_modeling/neural_rescorer/eval_neural_rescorer.py +++ b/scripts/asr_language_modeling/neural_rescorer/eval_neural_rescorer.py @@ -224,23 +224,13 @@ def main(): dataset = BeamScoresDataset(args.beams_file, model_tokenizer, args.eval_manifest, args.beam_size, max_seq_length) data_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=args.batch_size) - if args.use_amp: - if torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'): - logging.info("AMP is enabled!\n") - autocast = torch.cuda.amp.autocast - else: - - @contextlib.contextmanager - def autocast(): - yield - if "attention_mask" in inspect.getfullargspec(model.forward).args: support_att_mask = True else: support_att_mask = False logging.info(f"Rescoring with beam_size: {args.beam_size}") logging.info("Calculating the scores...") - with autocast(): + with torch.amp.autocast(model.device.type, enabled=args.use_amp): with torch.no_grad(): am_scores, lm_scores, dists, ref_lens, lens_in_chars = [], [], [], [], [] for batch in tqdm.tqdm(data_loader): diff --git a/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_ctc.py b/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_ctc.py index 2af8283c7b82..3bb4fa4f4846 
100644 --- a/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_ctc.py +++ b/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_ctc.py @@ -194,7 +194,9 @@ def beam_search_eval( ) _, beams_batch = decoding.ctc_decoder_predictions_tensor( - packed_batch, decoder_lengths=probs_lens, return_hypotheses=True, + packed_batch, + decoder_lengths=probs_lens, + return_hypotheses=True, ) for beams_idx, beams in enumerate(beams_batch): @@ -312,22 +314,7 @@ def main(cfg: EvalBeamSearchNGramConfig): ) else: - @contextlib.contextmanager - def default_autocast(): - yield - - if cfg.use_amp: - if torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'): - logging.info("AMP is enabled!\n") - autocast = torch.cuda.amp.autocast - - else: - autocast = default_autocast - else: - - autocast = default_autocast - - with autocast(): + with torch.amp.autocast(asr_model.device.type, enabled=cfg.use_amp): with torch.no_grad(): if isinstance(asr_model, EncDecHybridRNNTCTCModel): asr_model.cur_decoder = 'ctc' diff --git a/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py b/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py index 8548b839024f..c61a402c0942 100644 --- a/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py +++ b/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram_transducer.py @@ -177,7 +177,9 @@ def decoding_step( probs_batch[prob_index].unsqueeze(0), device=packed_batch.device, dtype=packed_batch.dtype ) best_hyp_batch, beams_batch = model.decoding.rnnt_decoder_predictions_tensor( - packed_batch, probs_lens, return_hypotheses=True, + packed_batch, + probs_lens, + return_hypotheses=True, ) if cfg.decoding_strategy == "greedy_batch": beams_batch = [[x] for x in best_hyp_batch] @@ -296,23 +298,8 @@ def main(cfg: EvalBeamSearchNGramConfig): ) else: - @contextlib.contextmanager - def default_autocast(): - yield - - if cfg.use_amp: - if torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'): - logging.info("AMP is enabled!\n") - autocast = torch.cuda.amp.autocast - - else: - autocast = default_autocast - else: - - autocast = default_autocast - # manual calculation of encoder_embeddings - with autocast(): + with torch.amp.autocast(asr_model.device.type, enabled=cfg.use_amp): with torch.no_grad(): asr_model.eval() asr_model.encoder.freeze() diff --git a/scripts/asr_language_modeling/ngram_lm/eval_wfst_decoding_ctc.py b/scripts/asr_language_modeling/ngram_lm/eval_wfst_decoding_ctc.py index a1db7cec4f23..63ab24b0921e 100644 --- a/scripts/asr_language_modeling/ngram_lm/eval_wfst_decoding_ctc.py +++ b/scripts/asr_language_modeling/ngram_lm/eval_wfst_decoding_ctc.py @@ -300,22 +300,7 @@ def main(cfg: EvalWFSTNGramConfig): ) else: - @contextlib.contextmanager - def default_autocast(): - yield - - if cfg.use_amp: - if torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'): - logging.info("AMP is enabled!\n") - autocast = torch.cuda.amp.autocast - - else: - autocast = default_autocast - else: - - autocast = default_autocast - - with autocast(): + with torch.amp.autocast(asr_model.device.type, enabled=cfg.use_amp): with torch.no_grad(): if isinstance(asr_model, EncDecHybridRNNTCTCModel): asr_model.cur_decoder = 'ctc' diff --git a/scripts/deploy/multimodal/deploy_triton.py b/scripts/deploy/multimodal/deploy_triton.py index 18463a3fc24a..1d0c755c12d8 100755 --- 
a/scripts/deploy/multimodal/deploy_triton.py +++ b/scripts/deploy/multimodal/deploy_triton.py @@ -101,6 +101,37 @@ def get_args(argv): type=int, help="Max batch size of the visual inputs, for lita/vita model with video inference, this should be set to 256", ) + parser.add_argument( + '--use_lora_plugin', + nargs='?', + const=None, + choices=['float16', 'float32', 'bfloat16'], + help="Activates the lora plugin which enables embedding sharing.", + ) + parser.add_argument( + '--lora_target_modules', + nargs='+', + default=None, + choices=[ + "attn_qkv", + "attn_q", + "attn_k", + "attn_v", + "attn_dense", + "mlp_h_to_4h", + "mlp_gate", + "mlp_4h_to_h", + ], + help="Add lora in which modules. Only be activated when use_lora_plugin is enabled.", + ) + parser.add_argument( + '--max_lora_rank', + type=int, + default=64, + help='maximum lora rank for different lora modules. ' + 'It is used to compute the workspace size of lora plugin.', + ) + parser.add_argument("--lora_checkpoint_path", default=None, type=str, help="The checkpoint path of LoRA weights") args = parser.parse_args(argv) return args @@ -133,7 +164,9 @@ def get_trt_deployable(args): raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.") exporter = TensorRTMMExporter( - model_dir=trt_path, load_model=(args.visual_checkpoint is None), modality=args.modality + model_dir=trt_path, + load_model=(args.visual_checkpoint is None), + modality=args.modality, ) if args.visual_checkpoint is not None: @@ -151,6 +184,10 @@ def get_trt_deployable(args): max_batch_size=args.max_batch_size, max_multimodal_len=args.max_multimodal_len, dtype=args.dtype, + use_lora_plugin=args.use_lora_plugin, + lora_target_modules=args.lora_target_modules, + max_lora_rank=args.max_lora_rank, + lora_checkpoint_path=args.lora_checkpoint_path, ) except Exception as error: raise RuntimeError("An error has occurred during the model export. 
Error message: " + str(error)) diff --git a/scripts/deploy/multimodal/query.py b/scripts/deploy/multimodal/query.py index 955d708730ac..bcc1e343df6a 100644 --- a/scripts/deploy/multimodal/query.py +++ b/scripts/deploy/multimodal/query.py @@ -36,6 +36,14 @@ def get_args(argv): parser.add_argument("-rp", "--repetition_penalty", default=1.0, type=float, help="repetition_penalty") parser.add_argument("-nb", "--num_beams", default=1, type=int, help="num_beams") parser.add_argument("-it", "--init_timeout", default=60.0, type=float, help="init timeout for the triton server") + parser.add_argument( + "-lt", + "--lora_task_uids", + default=None, + type=str, + nargs="+", + help="The list of LoRA task uids; use -1 to disable the LoRA module", + ) args = parser.parse_args(argv) return args @@ -55,5 +63,6 @@ def get_args(argv): repetition_penalty=args.repetition_penalty, num_beams=args.num_beams, init_timeout=args.init_timeout, + lora_uids=args.lora_task_uids, ) print(output) diff --git a/scripts/export.py b/scripts/export.py index dbe5b2b7fe2b..acfd3e3e3450 100644 --- a/scripts/export.py +++ b/scripts/export.py @@ -48,7 +48,8 @@ def get_args(argv): parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=f"Export NeMo models to ONNX/Torchscript", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description=f"Export NeMo models to ONNX/Torchscript", ) parser.add_argument("source", help="Source .nemo file") parser.add_argument("out", help="Location to write result to") @@ -154,11 +155,8 @@ def nemo_export(argv): kv[k] = v model.set_export_config(kv) - autocast = nullcontext - if args.autocast: - autocast = torch.cuda.amp.autocast try: - with autocast(), torch.no_grad(), torch.inference_mode(): + with torch.amp.autocast(args.device, enabled=args.autocast), torch.no_grad(), torch.inference_mode(): model.to(device=args.device).freeze() model.eval() input_example = None diff --git a/scripts/speech_recognition/confidence/benchmark_asr_confidence.py b/scripts/speech_recognition/confidence/benchmark_asr_confidence.py index 0c119b02ff7b..9c42ef6cca5b 100644 --- a/scripts/speech_recognition/confidence/benchmark_asr_confidence.py +++ b/scripts/speech_recognition/confidence/benchmark_asr_confidence.py @@ -209,12 +209,6 @@ def main(cfg: ConfidenceBenchmarkingConfig): filepaths.append(str(audio_file.absolute())) reference_texts.append(item['text']) - # setup AMP (optional) - autocast = None - if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'): - logging.info("AMP enabled!\n") - autocast = torch.cuda.amp.autocast - # do grid-based benchmarking if grid_params is provided, otherwise a regular one work_dir = Path(cfg.output_dir) os.makedirs(work_dir, exist_ok=True) @@ -275,7 +269,7 @@ def main(cfg: ConfidenceBenchmarkingConfig): cfg.batch_size, cfg.num_workers, plot_dir, - autocast, + cfg.amp, ) for level, result in results.items(): f.write(f"{model_typename},{','.join(param_list)},{level},{','.join([str(r) for r in result])}\n") @@ -303,7 +297,7 @@ def main(cfg: ConfidenceBenchmarkingConfig): filepaths, reference_texts, plot_dir, - autocast, + cfg.amp, ) for level, result in results.items(): f.write(f"{model_typename},{','.join(param_list)},{level},{','.join([str(r) for r in result])}\n") diff --git a/tests/collections/llm/gpt/model/test_model_import.py b/tests/collections/llm/gpt/model/test_model_import.py new file mode 100644 index 000000000000..3f65cc22654f --- /dev/null +++ 
b/tests/collections/llm/gpt/model/test_model_import.py @@ -0,0 +1,85 @@ +import torch + +torch.set_grad_enabled(False) + + +config_name_to_hf_id = { + 'MistralConfig7B': 'mistralai/Mistral-7B-v0.1', + # 'Nemotron3Config4B': 'nvidia/Minitron-4B-Base', + 'Llama2Config7B': 'meta-llama/Llama-2-7b-hf', + 'Llama3Config8B': 'meta-llama/Meta-Llama-3-8B', + # 'MixtralConfig8x7B': 'mistralai/Mixtral-8x7B-v0.1', + # 'ChatGLM2Config6B': 'THUDM/chatglm2-6b', + 'GemmaConfig2B': 'google/gemma-2b', + # 'Baichuan2Config7B': 'baichuan-inc/Baichuan2-7B-Base', +} + + +def strip_digits_from_end(s): + s = list(s) + while s and s[-1].isdigit(): + s = s[:-1] + return ''.join(s) + + +def get_modulename_from_config_name(config_name): + # Finds name of model class from config class name. + # Llama2Config7B -> Llama2Model (fail) -> LlamaModel + import nemo.collections.llm.gpt.model as nemo_ux_llms + + assert 'Config' in config_name, 'Expected config_name to contain "Config".' + module_name = config_name.split('Config')[0] + "Model" + if not hasattr(nemo_ux_llms, module_name): + module_name = strip_digits_from_end(config_name.split('Config')[0]) + "Model" + if not hasattr(nemo_ux_llms, module_name): + raise ValueError("Failed to get modulename") + return module_name + + +def generate_twolayer_checkpoints(config_name, hf_id): + from transformers import AutoConfig, AutoModel, AutoTokenizer + + config = AutoConfig.from_pretrained(hf_id, trust_remote_code=True) + # Reduce number of layers to two. + if hasattr(config, 'num_hidden_layers'): + print(config.num_hidden_layers) + config.num_hidden_layers = 2 + elif hasattr(config, 'num_layers'): + print(config.num_layers) + config.num_layers = 2 + else: + print(config) + raise ValueError("HF config has neither num_hidden_layers nor num_layers") + + # Calling random init is slow. + with torch.device('meta'): + model_2l = AutoModel.from_config(config, trust_remote_code=True) + + model_2l = model_2l.to_empty(device='cpu') + state = model_2l.state_dict() + # Fill state-dict with i/n + n = len(state.items()) + for i, key in enumerate(state.keys()): + value = torch.empty_like(state[key]).fill_(i / n) + state[key] = value + model_2l.load_state_dict(state) + model_2l.save_pretrained(f'hf_ckpts/{config_name}/', safe_serialization=False) + hf_tokenizer = AutoTokenizer.from_pretrained(hf_id, trust_remote_code=True) + hf_tokenizer.save_pretrained(f'hf_ckpts/{config_name}/', trust_remote_code=True) + + +def import_from_hf(config_name, hf_path): + import nemo.collections.llm.gpt.model as nemo_ux_llms + from nemo.collections.llm import import_ckpt + + module_name = get_modulename_from_config_name(config_name) + config_cls = getattr(nemo_ux_llms, config_name) + model_cls = getattr(nemo_ux_llms, module_name) + model = model_cls(config_cls()) + import_ckpt(model=model, source=hf_path) + + +if __name__ == '__main__': + for config_name, hf_id in config_name_to_hf_id.items(): + src = f'hf:///home/TestData/nemo2_ckpt/{config_name}' + import_from_hf(config_name, src) diff --git a/tests/collections/llm/gpt_finetuning.py b/tests/collections/llm/gpt_finetuning.py new file mode 100644 index 000000000000..09050595aebe --- /dev/null +++ b/tests/collections/llm/gpt_finetuning.py @@ -0,0 +1,115 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +from dataclasses import dataclass + +from megatron.core.optimizer import OptimizerConfig + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + +## NOTE: This script is present for github-actions testing only. + + +@dataclass +class Llama3Config96M(llm.Llama3Config8B): + seq_length: int = 2048 + num_layers: int = 2 + hidden_size: int = 768 + ffn_hidden_size: int = 3072 + num_attention_heads: int = 8 + + +def get_args(): + parser = argparse.ArgumentParser(description='Finetune a small GPT model using NeMo 2.0') + parser.add_argument('--restore_path', type=str, help="Path to model to be finetuned") + parser.add_argument('--experiment_dir', type=str, help="directory to write results and checkpoints to") + parser.add_argument('--devices', type=int, default=1, help="number of devices") + parser.add_argument('--mbs', type=int, default=1, help="micro batch size") + parser.add_argument('--tp_size', type=int, default=1, help="tensor parallel size") + parser.add_argument('--pp_size', type=int, default=1, help="pipeline parallel size") + + return parser.parse_args() + + +if __name__ == '__main__': + args = get_args() + + strategy = nl.MegatronStrategy( + tensor_model_parallel_size=args.tp_size, + pipeline_parallel_size=args.pp_size, + ) + + trainer = nl.Trainer( + devices=args.devices, + max_steps=2, + accelerator="gpu", + strategy=strategy, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + log_every_n_steps=1, + limit_val_batches=2, + val_check_interval=2, + num_sanity_val_steps=0, + ) + + ckpt = nl.ModelCheckpoint( + save_last=True, + monitor="reduced_train_loss", + save_top_k=1, + save_on_train_epoch_end=True, + save_optim_on_train_end=True, + ) + + logger = nl.NeMoLogger( + log_dir=args.experiment_dir, + use_datetime_version=False, # must be false if using auto resume + ckpt=ckpt, + ) + + adam = nl.MegatronOptimizerModule( + config=OptimizerConfig( + optimizer="adam", + lr=0.0001, + adam_beta2=0.98, + use_distributed_optimizer=True, + clip_grad=1.0, + bf16=True, + ), + ) + + lora = llm.peft.LoRA() + + squad = llm.SquadDataModule(seq_length=2048, micro_batch_size=args.mbs, global_batch_size=8, num_workers=0) + + tokenizer = get_nmt_tokenizer( + tokenizer_model="/lustre/fsw/coreai_dlalgo_llm/nemo_home/models/llama_96M/dummy_tokenizer.model" + ) + llama3_8b = llm.LlamaModel(Llama3Config96M(), tokenizer=tokenizer) + + resume = nl.AutoResume( + restore_config=nl.RestoreConfig(path=args.restore_path), + resume_if_exists=True, + ) + + llm.finetune( + model=llama3_8b, + data=squad, + trainer=trainer, + peft=lora, + log=logger, + optim=adam, + resume=resume, + ) diff --git a/tests/collections/llm/megatron_t5_pretraining.py b/tests/collections/llm/megatron_t5_pretraining.py new file mode 100644 index 000000000000..407a1d3ab96e --- /dev/null +++ b/tests/collections/llm/megatron_t5_pretraining.py @@ -0,0 +1,141 @@ +## NOTE: This script is present for github-actions testing only. +## There are no guarantees that this script is up-to-date with latest NeMo. 
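
One note before the T5 pretraining script body: the `warmup_steps` / `warmup_ratio` arguments added to `WarmupAnnealingScheduler` earlier in this patch are exercised by the script below. A minimal configuration sketch, assuming the usual NeMo `WarmupPolicy` semantics where only one of the two is set and `warmup_ratio` is treated as a fraction of `max_steps`; the numeric values are illustrative:

from nemo.lightning.pytorch.optim.lr_scheduler import WarmupAnnealingScheduler

lr_scheduler = WarmupAnnealingScheduler(
    warmup_steps=None,  # leave unset when warmup_ratio is given
    warmup_ratio=0.01,  # roughly 1% of max_steps spent warming up
    max_steps=10_000,   # illustrative value
    min_lr=1e-5,
)
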
+ +import argparse + +import torch +from megatron.core.optimizer import OptimizerConfig +from pytorch_lightning.loggers import WandbLogger + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import train +from nemo.collections.llm.t5.data import PreTrainingDataModule +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.lightning import NeMoLogger +from nemo.lightning.pytorch.callbacks import ModelCheckpoint +from nemo.lightning.pytorch.optim.lr_scheduler import WarmupAnnealingScheduler +from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule + + +def get_args(): + parser = argparse.ArgumentParser(description='Train a small T5 model using NeMo 2.0') + parser.add_argument('--devices', type=int, help="Number of devices to use for training") + parser.add_argument('--max-steps', type=int, help="Number of steps to train for") + parser.add_argument('--experiment-dir', type=str, help="directory to write results and checkpoints to") + parser.add_argument('--experiment-name', type=str, help="name of experiment") + parser.add_argument('--wandb-project', type=str, default=None, help="wandb project name") + parser.add_argument('--data-path', type=str, help="Path to data file") + parser.add_argument('--vocab-path', type=str, default=None, help="Path to vocab file") + parser.add_argument('--index-mapping-dir', type=str, help="directory to write index mappings to") + + return parser.parse_args() + + +if __name__ == '__main__': + + args = get_args() + + tokenizer = get_nmt_tokenizer( + "megatron", + "BertWordPieceCase", + vocab_file=args.vocab_path, + ) + data = PreTrainingDataModule( + paths=args.data_path, + seq_length=512, + seq_length_dec=128, + micro_batch_size=64, + global_batch_size=512, + seed=1234, + tokenizer=tokenizer, + split="99982,9,9", + index_mapping_dir=args.index_mapping_dir, + ) + t5_config = llm.t5.model.t5.T5Config( + num_layers=12, + encoder_num_layers=12, + hidden_size=768, + ffn_hidden_size=3072, + num_attention_heads=12, + kv_channels=64, + init_method_std=0.015, + hidden_dropout=0.1, + attention_dropout=0.1, + layernorm_epsilon=1e-5, + make_vocab_size_divisible_by=128, + max_position_embeddings=512, + bf16=True, + params_dtype=torch.bfloat16, + pipeline_dtype=torch.bfloat16, + ) + model = llm.t5.model.t5.T5Model(t5_config, tokenizer=data.tokenizer) + strategy = nl.MegatronStrategy( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + pipeline_dtype=None, + ) + checkpoint_callback = ModelCheckpoint( + every_n_train_steps=5000, + ) + callbacks = [checkpoint_callback] + + resume = nl.AutoResume( + resume_if_exists=True, + resume_ignore_no_checkpoint=True, + ) + + opt_config = OptimizerConfig( + optimizer='adam', + lr=0.0001, + use_distributed_optimizer=False, + bf16=True, + weight_decay=0.01, + ) + lr_scheduler = WarmupAnnealingScheduler( + warmup_steps=None, + warmup_ratio=0.01, + max_steps=args.max_steps, + min_lr=0.00001, + ) + opt = MegatronOptimizerModule( + config=opt_config, + lr_scheduler=lr_scheduler, + ) + + trainer = nl.Trainer( + devices=args.devices, + max_steps=args.max_steps, + accelerator="gpu", + strategy=strategy, + callbacks=callbacks, + log_every_n_steps=1, + limit_val_batches=2, + val_check_interval=2, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + ) + + if args.wandb_project is not None: + wandb_logger = WandbLogger( + name=args.experiment_name, + project=args.wandb_project, + log_model="all", + ) + else: + wandb_logger = 
None + nemo_logger = NeMoLogger( + name=args.experiment_name, + use_datetime_version=False, + log_dir=args.experiment_dir, + wandb=wandb_logger, + ) + + train( + model=model, + resume=resume, + data=data, + trainer=trainer, + log=nemo_logger, + tokenizer='data', + optim=opt, + ) diff --git a/tutorials/asr/asr_adapters/Multi_Task_Adapters.ipynb b/tutorials/asr/asr_adapters/Multi_Task_Adapters.ipynb index 7bd36e6b6ad8..852b3e838d5c 100644 --- a/tutorials/asr/asr_adapters/Multi_Task_Adapters.ipynb +++ b/tutorials/asr/asr_adapters/Multi_Task_Adapters.ipynb @@ -386,7 +386,7 @@ }, "outputs": [], "source": [ - "from nemo.collections.asr.data.audio_to_text_lhotse_prompted import get_prompt_format_fn, registered_prompt_format_fn" + "from nemo.collections.common.prompts.fn import get_prompt_format_fn, registered_prompt_format_fn" ] }, { @@ -707,7 +707,7 @@ "from lhotse.dataset.collation import collate_vectors\n", "\n", "from nemo.collections.asr.data.audio_to_text_lhotse import TokenizerWrapper\n", - "from nemo.collections.asr.data.audio_to_text_lhotse_prompted import PromptedAudioToTextLhotseDataset, get_prompt_format_fn\n", + "from nemo.collections.asr.data.audio_to_text_lhotse_prompted import PromptedAudioToTextLhotseDataset\n", "\n", "class MyCanaryPromptedAudioToTextLhotseDataset(torch.utils.data.Dataset):\n", " \"\"\"\n", diff --git a/tutorials/nlp/lora.ipynb b/tutorials/nlp/lora.ipynb index a8e3138ba8ef..c67fa6c2de15 100644 --- a/tutorials/nlp/lora.ipynb +++ b/tutorials/nlp/lora.ipynb @@ -2,32 +2,35 @@ "cells": [ { "cell_type": "markdown", + "metadata": { + "collapsed": false + }, "source": [ "Currently, this notebook must be run in a NeMo container.\n", "An example command to launch the container:\n", "```bash\n", "docker run --gpus all -it --rm -v :/NeMo --shm-size=8g -p 8888:8888 -p 6006:6006 --ulimit memlock=-1 --ulimit stack=67108864 \n", "```" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "# Update megatron version to the newest.\n", "!cd /workspace && python -m pip install -e git+https://github.com/NVIDIA/Megatron-LM#egg=megatron-core" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "%cd /NeMo/tutorials/nlp\n", @@ -36,10 +39,7 @@ "import wget\n", "import sys\n", "sys.path.insert(0, \"../..\") # find the local nemo first before the installed nemo" - ], - "metadata": { - "collapsed": false - } + ] }, { "attachments": {}, @@ -325,13 +325,13 @@ { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false + }, "outputs": [], "source": [ "!wget -nc --content-disposition {megatron_gpt_345m_nemo_url} -O {NEMO_DIR}/{gpt_file_name}" - ], - "metadata": { - "collapsed": false - } + ] }, { "attachments": {}, @@ -537,6 +537,9 @@ }, { "cell_type": "markdown", + "metadata": { + "collapsed": false + }, "source": [ "Simply substitute with the `MegatronT5SFTModel` class to use T5 instead of GPT.\n", "\n", @@ -544,10 +547,7 @@ "`model.add_adapter([LoraPEFTConfig(model_cfg), PtuningPEFTConfig(model_cfg)])`\n", "\n", "We're now ready to start training." 
-   ],
-   "metadata": {
-    "collapsed": false
-   }
+   ]
  },
  {
   "cell_type": "code",
@@ -597,6 +597,9 @@
 {
   "cell_type": "code",
   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
   "outputs": [],
   "source": [
    "# reimport libraries and classes in case one wants to only run cells from the Inference section\n",
@@ -612,31 +615,28 @@
    "DATA_DIR = \"data\"\n",
    "CONFIG_DIR = os.path.join(NEMO_DIR, \"conf\")\n",
    "SQUAD_DIR = os.path.join(DATA_DIR, \"SQuAD\")\n"
-   ],
-   "metadata": {
-    "collapsed": false
-   }
+   ]
  },
  {
   "cell_type": "markdown",
-   "source": [
-    "First, we will load and modify a config file that will be used for inference.\n"
-   ],
   "metadata": {
    "collapsed": false
-   }
+   },
+   "source": [
+    "First, we will load and modify a config file that will be used for inference.\n"
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
   "outputs": [],
   "source": [
    "# Download the example config file\n",
    "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/language_modeling/tuning/conf/megatron_gpt_generate_config.yaml', CONFIG_DIR)"
-   ],
-   "metadata": {
-    "collapsed": false
-   }
+   ]
  },
  {
   "cell_type": "code",
@@ -711,30 +711,30 @@
  },
  {
   "cell_type": "markdown",
-   "source": [
-    "The cell below is required if you are running the notebook end-to-end, and if you use a different batch size for training and evaluation. In this case, the microbatch calculator needs to be rest. If you are running training only or inference only, feel free to ignore this cell."
-   ],
   "metadata": {
    "collapsed": false
-   }
+   },
+   "source": [
+    "The cell below is required if you are running the notebook end-to-end and use a different batch size for training and evaluation. In this case, the microbatch calculator needs to be reset. If you are running training only or inference only, feel free to ignore this cell."
+   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
   "outputs": [],
   "source": [
-    "from nemo.utils.apex_utils import _reconfigure_microbatch_calculator\n",
-    "_reconfigure_microbatch_calculator(\n",
+    "from megatron.core.num_microbatches_calculator import reconfigure_num_microbatches_calculator\n",
+    "reconfigure_num_microbatches_calculator(\n",
    "    rank=0,\n",
    "    rampup_batch_size=None,\n",
    "    global_batch_size=config_eval.model.global_batch_size,\n",
    "    micro_batch_size=config_eval.model.micro_batch_size,\n",
    "    data_parallel_size=1,\n",
    ")"
-   ],
-   "metadata": {
-    "collapsed": false
-   }
+   ]
  },
  {
   "attachments": {},