From 6400bd57ac6b52a7fee069995fa488749f89014f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 23 Sep 2024 20:28:24 +0200 Subject: [PATCH 01/14] ci: Send direct alert on failed cherry-pick (#10588) * ci: Send direct alert on failed cherry-pick Signed-off-by: Oliver Koenig * also to me Signed-off-by: Oliver Koenig * f Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig --- .github/workflows/cherry-pick-release-commit.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index 48a7c4684a7e..949f4e5e2125 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -31,6 +31,7 @@ jobs: SHA=$(git rev-list --no-merges -n 1 HEAD) MESSAGE=$(git log -n 1 --pretty=format:%s $SHA) PR_ID=$(echo $MESSAGE | awk -F'#' '{print $2}' | awk -F')' '{print $1}' ) + USERNAME=$(git log -n 1 --pretty=format:%ae $SHA | awk -F'@' '{print $1}') PR=$(curl -L \ -H "Accept: application/vnd.github+json" \ @@ -110,7 +111,7 @@ jobs: "type": "section", "text": { "type": "mrkdwn", - "text": ":alert: Cherrypick bot πŸ€–: Cherry-pick of <'$URL'|#'$PR_ID'> failed" + "text": ":alert: Cherrypick bot πŸ€–: Hey @'$USERNAME': Cherry-pick of <'$URL'|#'$PR_ID'> failed (3-way merge impossible). Please resolve manually and create a PR.\n\ncc: @okoenig" } } ] From 7439b13857ee00a7c134d334aab17001a034b373 Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Mon, 23 Sep 2024 15:12:28 -0500 Subject: [PATCH 02/14] Add ConfigValidation plugin to nemo.lightning.run (#10541) * Add validation plugin to nemo.lightning.run Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Rename to ConfigValidationPlugin Signed-off-by: Hemil Desai * Add tests for each recipes with plugins Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Add nsys plugin test Signed-off-by: Hemil Desai * Add more tests Signed-off-by: Hemil Desai --------- Signed-off-by: Hemil Desai Signed-off-by: hemildesai Co-authored-by: hemildesai --- nemo/lightning/run/plugins.py | 68 +++++++++++++++- tests/lightning/test_nemo_run.py | 131 +++++++++++++++++++++++++++++++ 2 files changed, 197 insertions(+), 2 deletions(-) create mode 100644 tests/lightning/test_nemo_run.py diff --git a/nemo/lightning/run/plugins.py b/nemo/lightning/run/plugins.py index 18850c9d607e..61da7303d9cb 100644 --- a/nemo/lightning/run/plugins.py +++ b/nemo/lightning/run/plugins.py @@ -13,7 +13,6 @@ # limitations under the License. import copy -import logging import os from dataclasses import dataclass, field from pathlib import Path @@ -35,7 +34,7 @@ def _merge_callbacks(partial: run.Partial, callbacks: list[run.Config[Callback]]): if hasattr(partial, "trainer"): - if hasattr(partial.trainer, "callbacks"): + if hasattr(partial.trainer, "callbacks") and partial.trainer.callbacks: for callback in callbacks: if callback not in partial.trainer.callbacks: partial.trainer.callbacks.append(callback) @@ -177,3 +176,68 @@ def setup(self, task: run.Partial | run.Script, executor: run.Executor): logging.warning( f"The {self.__class__.__name__} will have no effect as WANDB_API_KEY environment variable is not set." ) + + +@dataclass(kw_only=True) +class ConfigValidationPlugin(run.Plugin): + """ + A plugin for validating a NeMo task with its executor. 
+ + This plugin is used to ensure that the NeMo environment, task, and executor meet certain criteria. + The validation checks include preemption, checkpoint directory, + serialization, and Weights and Biases (wandb) integration. + + Attributes: + validate_preemption (bool): Whether to validate the preemption callback. If set to True, the plugin will + assert that the task has a `PreemptionCallback`. Defaults to True. + validate_checkpoint_dir (bool): Whether to validate the checkpoint directory. If set to True and the executor + is a `SlurmExecutor`, the plugin will assert that the task's log directory exists in the mounts + specified in the `SlurmExecutor`. Defaults to True. + validate_serialization (bool): Whether to validate task serialization. If set to True, the plugin will + assert that the task can be successfully serialized and deserialized using NeMo-Run's + `ZlibJSONSerializer`. Defaults to True. + validate_wandb (bool): Whether to validate Weights and Biases integration. If set to True, the plugin will + assert that the executor's environment variables contain a `WANDB_API_KEY` + and that NeMo Logger's `wandb` is set. Defaults to False. + validate_nodes_and_devices (bool): Whether to validate the number of devices and nodes. If set to True, the plugin will assert that the task's + trainer is configured to use the same number of nodes and devices as the executor. Defaults to True. + """ + + validate_preemption: bool = True + validate_checkpoint_dir: bool = True + validate_serialization: bool = True + validate_wandb: bool = False + validate_nodes_and_devices: bool = True + + def setup(self, task: run.Partial | run.Script, executor: run.Executor): + assert isinstance(task, run.Partial) + logging.info(f"Validating {task.__fn_or_cls__.__qualname__} and {executor.__class__.__qualname__}.") + if self.validate_preemption: + logging.info("Validating preemption callback") + assert any(map(lambda callback: callback.__fn_or_cls__ == PreemptionCallback, task.trainer.callbacks)) + + if self.validate_checkpoint_dir: + if isinstance(executor, run.SlurmExecutor): + mounts = executor.container_mounts + ["/nemo_run"] + mounts = list(map(lambda m: m.split(":")[-1], mounts)) + logging.info(f"Validating checkpoint dir {task.log.log_dir} exists in {mounts}") + assert task.log.log_dir + assert any(map(lambda mount: Path(mount) in Path(task.log.log_dir).parents, mounts)) + + if self.validate_serialization: + from nemo_run.core.serialization.zlib_json import ZlibJSONSerializer + + logging.info("Validating serialization/de-serialization of task") + serializer = ZlibJSONSerializer() + assert serializer.deserialize(serializer.serialize(task)) == task + + if self.validate_wandb: + logging.info("Validating that Weights and Biases is enabled for task") + assert "WANDB_API_KEY" in executor.env_vars.keys() + assert task.log.wandb + + if self.validate_nodes_and_devices: + logging.info("Validating that nodes and devices match for task and executor") + if isinstance(executor, run.SlurmExecutor): + assert task.trainer.num_nodes == executor.nodes + assert task.trainer.devices == executor.nproc_per_node() diff --git a/tests/lightning/test_nemo_run.py b/tests/lightning/test_nemo_run.py new file mode 100644 index 000000000000..c513aa5c044f --- /dev/null +++ b/tests/lightning/test_nemo_run.py @@ -0,0 +1,131 @@ +from functools import partial + +import pytest + +BASE_CHECKPOINT_DIR = "/nemo_run/checkpoints" + + +@pytest.mark.parametrize( + "module, recipe, name", + [ + ("llama3_8b", "pretrain_recipe", 
"llama3_8b_pretrain"), + ("llama3_8b", "finetune_recipe", "llama3_8b_finetune"), + ("llama3_8b_16k", "pretrain_recipe", "llama3_8b_16k_pretrain"), + ("llama3_8b_16k", "finetune_recipe", "llama3_8b_16k_finetune"), + ("llama3_8b_64k", "pretrain_recipe", "llama3_8b_64k_pretrain"), + ("llama3_8b_64k", "finetune_recipe", "llama3_8b_64k_finetune"), + ("llama3_70b", "pretrain_recipe", "llama3_70b_pretrain"), + ("llama3_70b", "finetune_recipe", "llama3_70b_finetune"), + ("llama3_70b_16k", "pretrain_recipe", "llama3_70b_16k_pretrain"), + ("llama3_70b_16k", "finetune_recipe", "llama3_70b_16k_finetune"), + ("llama3_70b_64k", "pretrain_recipe", "llama3_70b_64k_pretrain"), + ("llama3_70b_64k", "finetune_recipe", "llama3_70b_64k_finetune"), + ("llama31_405b", "pretrain_recipe", "llama31_405b_pretrain"), + ("mistral", "pretrain_recipe", "mistral_pretrain"), + ("mistral", "finetune_recipe", "mistral_finetune"), + ("mixtral_8x3b", "pretrain_recipe", "mixtral_8x3b_pretrain"), + ("mixtral_8x3b", "finetune_recipe", "mixtral_8x3b_finetune"), + ("mixtral_8x3b_16k", "pretrain_recipe", "mixtral_8x3b_16k_pretrain"), + ("mixtral_8x3b_16k", "finetune_recipe", "mixtral_8x3b_16k_finetune"), + ("mixtral_8x3b_64k", "pretrain_recipe", "mixtral_8x3b_64k_pretrain"), + ("mixtral_8x3b_64k", "finetune_recipe", "mixtral_8x3b_64k_finetune"), + ("mixtral_8x7b", "pretrain_recipe", "mixtral_8x7b_pretrain"), + ("mixtral_8x7b", "finetune_recipe", "mixtral_8x7b_finetune"), + ("mixtral_8x7b_16k", "pretrain_recipe", "mixtral_8x7b_16k_pretrain"), + ("mixtral_8x7b_16k", "finetune_recipe", "mixtral_8x7b_16k_finetune"), + ("mixtral_8x7b_64k", "pretrain_recipe", "mixtral_8x7b_64k_pretrain"), + ("mixtral_8x7b_64k", "finetune_recipe", "mixtral_8x7b_64k_finetune"), + ("mixtral_8x22b", "pretrain_recipe", "mixtral_8x22b_pretrain"), + ("mixtral_8x22b", "finetune_recipe", "mixtral_8x22b_finetune"), + ("nemotron3_4b", "pretrain_recipe", "nemotron3_4b_pretrain"), + ("nemotron3_8b", "pretrain_recipe", "nemotron3_8b_pretrain"), + ("nemotron3_8b", "finetune_recipe", "nemotron3_8b_finetune"), + ("nemotron4_15b", "pretrain_recipe", "nemotron4_15b_pretrain"), + ("nemotron4_15b_16k", "pretrain_recipe", "nemotron4_15b_16k_pretrain"), + ("nemotron4_15b_64k", "pretrain_recipe", "nemotron4_15b_64k_pretrain"), + ("nemotron4_22b", "pretrain_recipe", "nemotron4_22b_pretrain"), + ("nemotron4_22b_16k", "pretrain_recipe", "nemotron4_22b_16k_pretrain"), + ("nemotron4_22b_64k", "pretrain_recipe", "nemotron4_22b_64k_pretrain"), + ("nemotron4_340b", "pretrain_recipe", "nemotron4_340b_pretrain"), + ("nemotron4_340b", "finetune_recipe", "nemotron4_340b_finetune"), + ], +) +def test_recipes_with_nemo_run(module, recipe, name, tmpdir, monkeypatch): + monkeypatch.setenv("NEMORUN_HOME", str(tmpdir)) + monkeypatch.setenv("WANDB_API_KEY", "dummy") + import nemo_run as run + + from nemo.collections import llm + from nemo.collections.llm.recipes.log.default import wandb_logger + from nemo.lightning.run import plugins + + recipe_config = getattr(getattr(llm, module), recipe)( + name=name, dir=BASE_CHECKPOINT_DIR, num_nodes=1, num_gpus_per_node=8 + ) + run_plugins = [ + plugins.PreemptionPlugin(), + plugins.WandbPlugin(name=name, logger_fn=partial(wandb_logger, entity="dummy", project="dummy")), + ] + validation_plugin = plugins.ConfigValidationPlugin(validate_wandb=True) + run_plugins.append(validation_plugin) + + with run.Experiment(f"{name}-unit-test") as exp: + exp.add( + recipe_config, + executor=run.SlurmExecutor( + account="dummy", + partition="dummy", + 
nodes=recipe_config.trainer.num_nodes, + ntasks_per_node=recipe_config.trainer.devices, + ), + name=name, + plugins=run_plugins, + ) + exp.dryrun() + + with pytest.raises(AssertionError): + with run.Experiment(f"{name}-unit-test-fail-validate-nodes-and-devices") as exp: + exp.add( + recipe_config, + executor=run.SlurmExecutor( + account="dummy", + partition="dummy", + nodes=recipe_config.trainer.num_nodes + 1, + ntasks_per_node=recipe_config.trainer.devices + 1, + ), + name=name, + plugins=run_plugins, + ) + exp.dryrun() + + with pytest.raises(AssertionError): + cfg = recipe_config.clone() + cfg.log.log_dir = "/temporary-does-not-exist" + with run.Experiment(f"{name}-unit-test-fail-validate-checkpoint-dir") as exp: + exp.add( + cfg, + executor=run.SlurmExecutor( + account="dummy", + partition="dummy", + nodes=cfg.trainer.num_nodes, + ntasks_per_node=cfg.trainer.devices, + ), + name=name, + plugins=run_plugins, + ) + exp.dryrun() + + run_plugins = [plugins.NsysPlugin(start_step=3, end_step=4)] + run_plugins + with run.Experiment(f"{name}-nsys-unit-test") as exp: + exp.add( + recipe_config, + executor=run.SlurmExecutor( + account="dummy", + partition="dummy", + nodes=recipe_config.trainer.num_nodes, + ntasks_per_node=recipe_config.trainer.devices, + ), + name=name, + plugins=run_plugins, + ) + exp.dryrun() From c02ea1296168da2ea0e1aab179d244ceb959c2f0 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Mon, 23 Sep 2024 17:49:35 -0400 Subject: [PATCH 03/14] Fix pps issue on nemo export (#10544) * fix minor import bug Signed-off-by: Onur Yilmaz * fix pps bug * remove config file Signed-off-by: Onur Yilmaz --------- Signed-off-by: Onur Yilmaz Co-authored-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> --- nemo/export/trt_llm/converter/model_converter.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/nemo/export/trt_llm/converter/model_converter.py b/nemo/export/trt_llm/converter/model_converter.py index 627096168d7b..366206c948eb 100755 --- a/nemo/export/trt_llm/converter/model_converter.py +++ b/nemo/export/trt_llm/converter/model_converter.py @@ -231,15 +231,12 @@ def model_to_trtllm_ckpt( "transformer.ln_f.bias", } - gpus_per_node = tensor_parallel_size if gpus_per_node is None else gpus_per_node - for i in range(world_size): mapping = tensorrt_llm.Mapping( world_size=world_size, rank=i, tp_size=tensor_parallel_size, pp_size=pipeline_parallel_size, - gpus_per_node=gpus_per_node, ) layers_range = mapping.pp_layers(num_layers) From 53a10a78d301975809293da02842ac40a923011a Mon Sep 17 00:00:00 2001 From: "He Huang (Steve)" <105218074+stevehuang52@users.noreply.github.com> Date: Tue, 24 Sep 2024 00:00:38 -0400 Subject: [PATCH 04/14] fix type error in llm collection (#10552) * fix type bugs Signed-off-by: stevehuang52 * Update mixin.py add type hint Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * Apply isort and black reformatting Signed-off-by: stevehuang52 * Update mixin.py Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * Apply isort and black reformatting Signed-off-by: stevehuang52 * Update mixin.py Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> --------- Signed-off-by: stevehuang52 Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Signed-off-by: stevehuang52 Co-authored-by: stevehuang52 --- nemo/lightning/io/connector.py | 2 +- nemo/lightning/io/mixin.py | 9 ++++++--- 2 files 
changed, 7 insertions(+), 4 deletions(-) diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py index 48222a4bd04d..7d81e631d6f1 100644 --- a/nemo/lightning/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -222,7 +222,7 @@ def nemo_load( def local_path(self, base_path: Optional[Path] = None) -> Path: if base_path: - _base = base_path + _base = Path(base_path) else: from nemo.lightning.base import NEMO_MODELS_CACHE diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index 36fb36bfcb34..c196d17e3343 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -306,13 +306,15 @@ def import_ckpt(self, path: str, overwrite: bool = False, base_path: Optional[Pa return ckpt_path @classmethod - def _get_connector(cls, ext, path=None, importer=True, **kwargs) -> ModelConnector: + def _get_connector( + cls, ext: Union[str, Path], path: Optional[Union[str, Path]] = None, importer: bool = True, **kwargs + ) -> ModelConnector: """ Retrieves the appropriate model connector based on the file extension and path, distinguishing between importers and exporters. Args: - ext (str): The file extension or a URI that may include a protocol specifier. + ext (Union[str, Path]): The file extension or a URI that may include a protocol specifier. path (Optional[Union[str, Path]]): The path where the model file is located or will be saved. importer (bool): Flag to determine if the connector is for importing (True) or exporting (False). @@ -326,10 +328,11 @@ def _get_connector(cls, ext, path=None, importer=True, **kwargs) -> ModelConnect when required. """ _path = None + ext = str(ext) if "://" in ext: ext, _path = ext.split("://") else: - _path = path + _path = str(path) connector = cls._IMPORTERS.get(str(cls) + ext) if importer else cls._EXPORTERS.get(str(cls) + ext) if not connector: From 6023b80650b414d2444f8088d580b2c0fcde4aaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 24 Sep 2024 07:31:37 +0200 Subject: [PATCH 05/14] ci: Safer sequence escaping (#10595) Signed-off-by: Oliver Koenig --- .../workflows/cherry-pick-release-commit.yml | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index 949f4e5e2125..022399ec4e5b 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -70,12 +70,21 @@ jobs: if [[ $CHERRYPICK_SUCCESSFUL -eq 0 ]]; then PR_URL="https://github.com/NVIDIA/NeMo/pull/$PR_ID" - PAYLOAD='{ - "title": "Cherry pick `'$PR_TITLE' ('$PR_ID')` into `'$RELEASE_BRANCH'`", - "head": "cherry-pick-'$PR_ID'-'$RELEASE_BRANCH'", - "base": "'$RELEASE_BRANCH'", - "body": "[πŸ€–]: Hi @'$AUTHOR' πŸ‘‹,
-
-we'"'"'ve cherry picked #'$PR_ID' into `'$RELEASE_BRANCH'` for you! πŸš€
-
Please review and approve this cherry pick by your convenience!" - }' + + PAYLOAD=$(jq \ + -n \ + -c \ + --arg TITLE "Cherry pick \`$PR_TITLE ($PR_ID)\` into \`$RELEASE_BRANCH\`" \ + --arg HEAD "cherry-pick-$PR_ID-$RELEASE_BRANCH" \ + --arg RELEASE_BRANCH "$RELEASE_BRANCH" \ + --arg BODY "[πŸ€–]: Hi @$AUTHOR πŸ‘‹,
+
+we've cherry-picked #$PR_ID into \`$RELEASE_BRANCH\` for you! πŸš€
+
Please review and approve this cherry pick by your convenience\!" \ + '{ + "title": $TITLE, + "head": $HEAD, + "base": $RELEASE_BRANCH, + "body": $BODY + }' + ) NEW_PR=$(curl -L \ -X POST \ @@ -85,7 +94,7 @@ jobs: https://api.github.com/repos/NVIDIA/NeMo/pulls \ -d $PAYLOAD) - NEW_PR_ID=$(echo -e $NEW_PR | jq '.number') + NEW_PR_ID=$(echo -E $NEW_PR | jq '.number') curl -L \ -X POST \ -H "Accept: application/vnd.github+json" \ From c4e415788f17ce22722e20bcc9cd2e0b274f0925 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 24 Sep 2024 11:16:30 +0200 Subject: [PATCH 06/14] ci: Fix issues with version bump (#10467) * ci: Fix issues with version bump Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * f Signed-off-by: Oliver Koenig * f Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig --- .github/workflows/release-freeze.yml | 70 ++++++++++++++++------------ 1 file changed, 39 insertions(+), 31 deletions(-) diff --git a/.github/workflows/release-freeze.yml b/.github/workflows/release-freeze.yml index 7f8cd3dad8f5..2f4799cfc5e2 100644 --- a/.github/workflows/release-freeze.yml +++ b/.github/workflows/release-freeze.yml @@ -1,21 +1,20 @@ -name: "NeMo Code freeze" +name: "Code freeze" on: workflow_dispatch: inputs: - next_version: - description: 'MAJOR.MINOR.PATCH[rcN] (Example: 2.0.0rc1, or 2.1.0)' - required: true - type: string - is_prelease: - description: Whether to keep and bump the pre-release label - required: false - default: false - type: boolean + type_of_release: + type: choice + description: Type of release + options: + - major + - minor + - pre_release mcore_version: description: 'Version of MCore to use (must be a valid git ref)' required: true type: string + jobs: create-release-branch: runs-on: ubuntu-latest @@ -39,7 +38,7 @@ jobs: run: | cd ${{ github.run_id }} - if [[ "${{ inputs.is_prelease }}" == "false" ]]; then + if [[ "${{ inputs.type_of_release }}" != "pre_release" ]]; then sed -i "/^PRE_RELEASE/c\PRE_RELEASE = ''" nemo/package_info.py fi @@ -106,33 +105,42 @@ jobs: id: bump-version run: | cd ${{ github.run_id }} - FULL_VERSION_NUM=${{ inputs.next_version }} - VERSION=${FULL_VERSION_NUM%%rc*} - MAJOR=$(echo "$VERSION" | cut -d. -f1) - MINOR=$(echo "$VERSION" | cut -d. -f2) - PATCH=$(echo "$VERSION" | cut -d. 
-f3) - PRE_RELEASE=${FULL_VERSION_NUM#$VERSION} - - sed -i 's/^MAJOR\s*=\s*[0-9]\+/MAJOR = '$MAJOR'/' $VERSION_FILE - sed -i 's/^MINOR\s*=\s*[0-9]\+/MINOR = '$MINOR'/' $VERSION_FILE - sed -i 's/^PATCH\s*=\s*[0-9]\+/PATCH = '$PATCH'/' $VERSION_FILE - sed -i 's/^PRE_RELEASE\s*=\s*'.*'/PRE_RELEASE = '\'$PRE_RELEASE\''/' $VERSION_FILE - - cat $VERSION_FILE - PRE_RELEASE=$(echo $PRE_RELEASE | tr -d "'") - echo "version=$MAJOR.$MINOR.$PATCH$PRE_RELEASE" >> "$GITHUB_OUTPUT" + PRE_RELEASE=$(cat nemo/package_info.py | awk '/^PRE_RELEASE = /' | awk -F"= " '{print $2}' | tr -d '"' | tr -d "'") + MAJOR=$(cat nemo/package_info.py | awk '/^MAJOR = /' | awk -F"= " '{print $2}') + MINOR=$(cat nemo/package_info.py | awk '/^MINOR = /' | awk -F"= " '{print $2}') + PATCH=$(cat nemo/package_info.py | awk '/^PATCH = /' | awk -F"= " '{print $2}') + + if [[ "${{ inputs.type_of_release }}" == "pre_release" ]]; then + NEXT_MAJOR=$MAJOR + NEXT_MINOR=$MINOR + NEXT_PRE_RELEASE=rc$(( $(echo $PRE_RELEASE | awk -F"rc" '{print $2}') + 1)) + elif [[ "${{ inputs.type_of_release }}" == "major" ]]; then + NEXT_MAJOR=$(( MAJOR + 1)) + NEXT_MINOR=0 + NEXT_PRE_RELEASE=rc0 + else + NEXT_MAJOR=$MAJOR + NEXT_MINOR=$(( MINOR + 1)) + NEXT_PRE_RELEASE=rc0 + fi + + sed -i "/^MAJOR/c\MAJOR = $NEXT_MAJOR" nemo/package_info.py + sed -i "/^MINOR/c\MINOR = $NEXT_MINOR" nemo/package_info.py + sed -i "/^PRE_RELEASE/c\PRE_RELEASE = '$NEXT_PRE_RELEASE'" nemo/package_info.py + + echo "version=$NEXT_MAJOR.$NEXT_MINOR.$PATCH$NEXT_PRE_RELEASE" >> "$GITHUB_OUTPUT" - name: Create Version Bump PR uses: peter-evans/create-pull-request@v6 id: create-pull-request with: path: ${{ github.run_id }} - branch: bot/chore/version-bump-${{ inputs.next_version }} - title: 'Version bump to `${{ inputs.next_version }}`' + branch: bot/chore/version-bump-${{ steps.bump-version.outputs.version }} + title: 'Version bump to `${{ steps.bump-version.outputs.version }}`' body: | - πŸš€ Version bump NeMo toolkit to `${{ inputs.next_version }}` + πŸš€ Version bump NeMo-Toolkit to `${{ steps.bump-version.outputs.version }}` - commit-message: "[🀠]: Howdy folks, let's bump NeMo `${{ inputs.next_version }}` !" + commit-message: "[🀠]: Howdy folks, let's bump NeMo-Toolkit `${{ steps.bump-version.outputs.version }}` !" 
signoff: true assignees: okoenig labels: 'Run CICD' @@ -151,7 +159,7 @@ jobs: "type": "section", "text": { "type": "mrkdwn", - "text": "Releasebot πŸ€–: NeMo Toolkit has been frozen πŸŽ‰ to branch `r${{ needs.create-release-branch.outputs.version }}`" + "text": "Releasebot πŸ€–: NeMo-Toolkit has been frozen πŸŽ‰ to branch `r${{ needs.create-release-branch.outputs.version }}`" } } ] From 810d07f03034f481e3eee2332683c73d2fcecb5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 24 Sep 2024 11:36:36 +0200 Subject: [PATCH 07/14] ci: Add missing test specs (#10597) Signed-off-by: Oliver Koenig --- .github/workflows/cicd-main.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 730c363b41f3..74ae8d57b738 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -750,6 +750,7 @@ jobs: OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: |- @@ -2950,6 +2951,7 @@ jobs: L2_Megatron_GPT_Skip_Train: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Skip_Train') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -3222,6 +3224,7 @@ jobs: L2_Megatron_GPT_with_Drop_Optimizer_States_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_with_Drop_Optimizer_States_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | From 0fad1c15f949deef82b0f031021e30002975107b Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 24 Sep 2024 16:40:00 +0200 Subject: [PATCH 08/14] Extending modelopt spec for TEDotProductAttention (#10523) * Extend modelopt spec for TEDotProductAttention to support sliding window attention Signed-off-by: Jan Lasek * Simplify import guarding Signed-off-by: Jan Lasek --------- Signed-off-by: Jan Lasek --- .../megatron/gpt_layer_modelopt_spec.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py index e05c61bf3d24..080984133f3c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py @@ -13,11 +13,10 @@ # limitations under the License. 
try: - from megatron.core.extensions.transformer_engine import TENorm + from megatron.core.extensions.transformer_engine import TEDotProductAttention, TENorm from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules - from megatron.core.transformer.dot_product_attention import DotProductAttention from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.mlp import MLP, MLPSubmodules @@ -31,25 +30,23 @@ from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults - TransformerLayer = TransformerLayerSubmodules = ApexGuardDefaults - MLP = MLPSubmodules = ModuleSpec = IdentityOp = ApexGuardDefaults - AttnMaskType = DotProductAttention = TENorm = ApexGuardDefaults - ColumnParallelLinear = RowParallelLinear = SelfAttention = SelfAttentionSubmodules = ApexGuardDefaults - + ModuleSpec = ApexGuardDefaults HAVE_MEGATRON_CORE = False IMPORT_ERROR = e # Use this spec for Model Optimizer PTQ and TensorRT-LLM export def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec: - """Mix the native spec with TENorm. + """Mix the native spec with TENorm and TEDotProductAttention. This is essentially the native local spec except for the layernorm implementation is using TENorm from Transformer-Engine. This TENorm supports both FusedLayerNorm and RMSNorm and prevents the apex dependency. + + TEDotProductAttention is used to support sliding window attention. """ if not HAVE_MEGATRON_CORE: - raise Exception(IMPORT_ERROR) + raise IMPORT_ERROR return ModuleSpec( module=TransformerLayer, @@ -60,7 +57,7 @@ def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec: params={"attn_mask_type": AttnMaskType.causal}, submodules=SelfAttentionSubmodules( linear_qkv=ColumnParallelLinear, - core_attention=DotProductAttention, + core_attention=TEDotProductAttention, linear_proj=RowParallelLinear, q_layernorm=IdentityOp, k_layernorm=IdentityOp, From 849e7e02356d45fc589a162aedb437dbae854095 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Tue, 24 Sep 2024 11:25:39 -0400 Subject: [PATCH 09/14] Update Multi_Task_Adapters.ipynb (#10600) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Piotr Ε»elasko --- tutorials/asr/asr_adapters/Multi_Task_Adapters.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tutorials/asr/asr_adapters/Multi_Task_Adapters.ipynb b/tutorials/asr/asr_adapters/Multi_Task_Adapters.ipynb index 7bd36e6b6ad8..852b3e838d5c 100644 --- a/tutorials/asr/asr_adapters/Multi_Task_Adapters.ipynb +++ b/tutorials/asr/asr_adapters/Multi_Task_Adapters.ipynb @@ -386,7 +386,7 @@ }, "outputs": [], "source": [ - "from nemo.collections.asr.data.audio_to_text_lhotse_prompted import get_prompt_format_fn, registered_prompt_format_fn" + "from nemo.collections.common.prompts.fn import get_prompt_format_fn, registered_prompt_format_fn" ] }, { @@ -707,7 +707,7 @@ "from lhotse.dataset.collation import collate_vectors\n", "\n", "from nemo.collections.asr.data.audio_to_text_lhotse import TokenizerWrapper\n", - "from nemo.collections.asr.data.audio_to_text_lhotse_prompted import PromptedAudioToTextLhotseDataset, get_prompt_format_fn\n", + "from nemo.collections.asr.data.audio_to_text_lhotse_prompted import 
PromptedAudioToTextLhotseDataset\n", "\n", "class MyCanaryPromptedAudioToTextLhotseDataset(torch.utils.data.Dataset):\n", " \"\"\"\n", From f351f64533c2af9c7e56f7fc687f89f173183978 Mon Sep 17 00:00:00 2001 From: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Date: Tue, 24 Sep 2024 10:47:28 -0700 Subject: [PATCH 10/14] Change default for always_save_context to True (#10547) Signed-off-by: Abhishree --- nemo/lightning/pytorch/callbacks/model_checkpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/lightning/pytorch/callbacks/model_checkpoint.py b/nemo/lightning/pytorch/callbacks/model_checkpoint.py index 2df4ca56d1a0..6aee365a3f60 100644 --- a/nemo/lightning/pytorch/callbacks/model_checkpoint.py +++ b/nemo/lightning/pytorch/callbacks/model_checkpoint.py @@ -73,7 +73,7 @@ def __init__( train_time_interval: Optional[timedelta] = None, save_on_train_epoch_end: Optional[bool] = False, # Save after training, not after validation save_optim_on_train_end: Optional[bool] = False, - always_save_context: bool = False, + always_save_context: bool = True, save_context_on_train_end: bool = True, **kwargs, ): From 70bc06bce63258d2d1cd6fc9e8bf2f37ebbd65fb Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Tue, 24 Sep 2024 11:27:57 -0700 Subject: [PATCH 11/14] Import guard for SimpleMultiModalDataModule (#10592) Signed-off-by: Alexandros Koumparoulis --- nemo/collections/multimodal/data/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nemo/collections/multimodal/data/__init__.py b/nemo/collections/multimodal/data/__init__.py index 01b98aecaecd..7e6ac24828f5 100644 --- a/nemo/collections/multimodal/data/__init__.py +++ b/nemo/collections/multimodal/data/__init__.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from nemo.utils.import_utils import safe_import_from -from nemo.collections.multimodal.data.energon import SimpleMultiModalDataModule - +SimpleMultiModalDataModule, _ = safe_import_from( + "nemo.collections.multimodal.data.energon", "SimpleMultiModalDataModule" +) __all__ = ["SimpleMultiModalDataModule"] From 9d5a1aab5ae872cb1fb02ed8f211c43336cf90d4 Mon Sep 17 00:00:00 2001 From: Nithin Rao Date: Tue, 24 Sep 2024 14:29:12 -0400 Subject: [PATCH 12/14] add support for train_time_interval to consider hydra object (#10559) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri --- nemo/utils/exp_manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index ca60010c6fda..543c7e0781d2 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -116,7 +116,7 @@ class CallbackParams: auto_insert_metric_name: bool = True every_n_epochs: Optional[int] = 1 every_n_train_steps: Optional[int] = None - train_time_interval: Optional[str] = None + train_time_interval: Optional[Any] = None prefix: Optional[str] = None # If None, exp_manager will attempt to handle the filepath postfix: str = ".nemo" save_best_model: bool = False @@ -374,6 +374,8 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo - max_time (str): The maximum wall clock time *per run*. This is intended to be used on clusters where you want a checkpoint to be saved after this specified time and be able to resume from that checkpoint. Defaults to None. 
- seconds_to_sleep (float): seconds to sleep non rank 0 processes for. Used to give enough time for rank 0 to initialize + - train_time_interval (timedelta): pass an object of timedelta to save the model every timedelta. Defaults to None. + (use _target_ with hydra to achieve this) returns: log_dir (Path): The final logging directory where logging files are saved. Usually the concatenation of From 877144a9eb299098792f57f58fec64795d31c67d Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Tue, 24 Sep 2024 13:28:23 -0700 Subject: [PATCH 13/14] Move update_config_with_dtype_overrides logging to debug (#10602) * Move update_config_with_dtype_overrides logging to debug Signed-off-by: Alexandros Koumparoulis * update comment Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis --- nemo/lightning/pytorch/plugins/mixed_precision.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/lightning/pytorch/plugins/mixed_precision.py b/nemo/lightning/pytorch/plugins/mixed_precision.py index c48bbcf8c1b1..5c318b59e54a 100644 --- a/nemo/lightning/pytorch/plugins/mixed_precision.py +++ b/nemo/lightning/pytorch/plugins/mixed_precision.py @@ -220,12 +220,12 @@ def update_config_with_dtype_overrides(dtype_config, config): for field in fields(dtype_config): if not hasattr(config, field.name): continue - # If we overwrote a value, throw a warning. + # If we overwrote a value, log a debug message. old_val = getattr(config, field.name) new_val = getattr(dtype_config, field.name) if old_val != new_val: setattr(config, field.name, new_val) - logging.warning(f"Overwrote {type(config).__name__}.{field.name} {old_val} -> {new_val}") + logging.debug(f"Overwrote {type(config).__name__}.{field.name} {old_val} -> {new_val}") return config From 0ec10d2dc97a7107f4d1cc25fc453bfe1c155878 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 25 Sep 2024 13:38:51 +0200 Subject: [PATCH 14/14] ci: Wrap into quotes (#10616) Signed-off-by: Oliver Koenig --- .github/workflows/cherry-pick-release-commit.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index 022399ec4e5b..565592d63ef4 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -92,7 +92,7 @@ jobs: -H "Authorization: Bearer $GH_TOKEN" \ -H "X-GitHub-Api-Version: 2022-11-28" \ https://api.github.com/repos/NVIDIA/NeMo/pulls \ - -d $PAYLOAD) + -d "$PAYLOAD") NEW_PR_ID=$(echo -E $NEW_PR | jq '.number') curl -L \