From 6400bd57ac6b52a7fee069995fa488749f89014f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 23 Sep 2024 20:28:24 +0200 Subject: [PATCH 01/14] ci: Send direct alert on failed cherry-pick (#10588) * ci: Send direct alert on failed cherry-pick Signed-off-by: Oliver Koenig * also to me Signed-off-by: Oliver Koenig * f Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig --- .github/workflows/cherry-pick-release-commit.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index 48a7c4684a7e..949f4e5e2125 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -31,6 +31,7 @@ jobs: SHA=$(git rev-list --no-merges -n 1 HEAD) MESSAGE=$(git log -n 1 --pretty=format:%s $SHA) PR_ID=$(echo $MESSAGE | awk -F'#' '{print $2}' | awk -F')' '{print $1}' ) + USERNAME=$(git log -n 1 --pretty=format:%ae $SHA | awk -F'@' '{print $1}') PR=$(curl -L \ -H "Accept: application/vnd.github+json" \ @@ -110,7 +111,7 @@ jobs: "type": "section", "text": { "type": "mrkdwn", - "text": ":alert: Cherrypick bot πŸ€–: Cherry-pick of <'$URL'|#'$PR_ID'> failed" + "text": ":alert: Cherrypick bot πŸ€–: Hey @'$USERNAME': Cherry-pick of <'$URL'|#'$PR_ID'> failed (3-way merge impossible). Please resolve manually and create a PR.\n\ncc: @okoenig" } } ] From 7439b13857ee00a7c134d334aab17001a034b373 Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Mon, 23 Sep 2024 15:12:28 -0500 Subject: [PATCH 02/14] Add ConfigValidation plugin to nemo.lightning.run (#10541) * Add validation plugin to nemo.lightning.run Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Rename to ConfigValidationPlugin Signed-off-by: Hemil Desai * Add tests for each recipes with plugins Signed-off-by: Hemil Desai * Apply isort and black reformatting Signed-off-by: hemildesai * Add nsys plugin test Signed-off-by: Hemil Desai * Add more tests Signed-off-by: Hemil Desai --------- Signed-off-by: Hemil Desai Signed-off-by: hemildesai Co-authored-by: hemildesai --- nemo/lightning/run/plugins.py | 68 +++++++++++++++- tests/lightning/test_nemo_run.py | 131 +++++++++++++++++++++++++++++++ 2 files changed, 197 insertions(+), 2 deletions(-) create mode 100644 tests/lightning/test_nemo_run.py diff --git a/nemo/lightning/run/plugins.py b/nemo/lightning/run/plugins.py index 18850c9d607e..61da7303d9cb 100644 --- a/nemo/lightning/run/plugins.py +++ b/nemo/lightning/run/plugins.py @@ -13,7 +13,6 @@ # limitations under the License. import copy -import logging import os from dataclasses import dataclass, field from pathlib import Path @@ -35,7 +34,7 @@ def _merge_callbacks(partial: run.Partial, callbacks: list[run.Config[Callback]]): if hasattr(partial, "trainer"): - if hasattr(partial.trainer, "callbacks"): + if hasattr(partial.trainer, "callbacks") and partial.trainer.callbacks: for callback in callbacks: if callback not in partial.trainer.callbacks: partial.trainer.callbacks.append(callback) @@ -177,3 +176,68 @@ def setup(self, task: run.Partial | run.Script, executor: run.Executor): logging.warning( f"The {self.__class__.__name__} will have no effect as WANDB_API_KEY environment variable is not set." ) + + +@dataclass(kw_only=True) +class ConfigValidationPlugin(run.Plugin): + """ + A plugin for validating a NeMo task with its executor. 
+ + This plugin is used to ensure that the NeMo environment, task, and executor meet certain criteria. + The validation checks include preemption, checkpoint directory, + serialization, and Weights and Biases (wandb) integration. + + Attributes: + validate_preemption (bool): Whether to validate the preemption callback. If set to True, the plugin will + assert that the task has a `PreemptionCallback`. Defaults to True. + validate_checkpoint_dir (bool): Whether to validate the checkpoint directory. If set to True and the executor + is a `SlurmExecutor`, the plugin will assert that the task's log directory exists in the mounts + specified in the `SlurmExecutor`. Defaults to True. + validate_serialization (bool): Whether to validate task serialization. If set to True, the plugin will + assert that the task can be successfully serialized and deserialized using NeMo-Run's + `ZlibJSONSerializer`. Defaults to True. + validate_wandb (bool): Whether to validate Weights and Biases integration. If set to True, the plugin will + assert that the executor's environment variables contain a `WANDB_API_KEY` + and that NeMo Logger's `wandb` is set. Defaults to False. + validate_nodes_and_devices (bool): Whether to validate the number of devices and nodes. If set to True, the plugin will assert that the task's + trainer is configured to use the same number of nodes and devices as the executor. Defaults to True. + """ + + validate_preemption: bool = True + validate_checkpoint_dir: bool = True + validate_serialization: bool = True + validate_wandb: bool = False + validate_nodes_and_devices: bool = True + + def setup(self, task: run.Partial | run.Script, executor: run.Executor): + assert isinstance(task, run.Partial) + logging.info(f"Validating {task.__fn_or_cls__.__qualname__} and {executor.__class__.__qualname__}.") + if self.validate_preemption: + logging.info("Validating preemption callback") + assert any(map(lambda callback: callback.__fn_or_cls__ == PreemptionCallback, task.trainer.callbacks)) + + if self.validate_checkpoint_dir: + if isinstance(executor, run.SlurmExecutor): + mounts = executor.container_mounts + ["/nemo_run"] + mounts = list(map(lambda m: m.split(":")[-1], mounts)) + logging.info(f"Validating checkpoint dir {task.log.log_dir} exists in {mounts}") + assert task.log.log_dir + assert any(map(lambda mount: Path(mount) in Path(task.log.log_dir).parents, mounts)) + + if self.validate_serialization: + from nemo_run.core.serialization.zlib_json import ZlibJSONSerializer + + logging.info("Validating serialization/de-serialization of task") + serializer = ZlibJSONSerializer() + assert serializer.deserialize(serializer.serialize(task)) == task + + if self.validate_wandb: + logging.info("Validating that Weights and Biases is enabled for task") + assert "WANDB_API_KEY" in executor.env_vars.keys() + assert task.log.wandb + + if self.validate_nodes_and_devices: + logging.info("Validating that nodes and devices match for task and executor") + if isinstance(executor, run.SlurmExecutor): + assert task.trainer.num_nodes == executor.nodes + assert task.trainer.devices == executor.nproc_per_node() diff --git a/tests/lightning/test_nemo_run.py b/tests/lightning/test_nemo_run.py new file mode 100644 index 000000000000..c513aa5c044f --- /dev/null +++ b/tests/lightning/test_nemo_run.py @@ -0,0 +1,131 @@ +from functools import partial + +import pytest + +BASE_CHECKPOINT_DIR = "/nemo_run/checkpoints" + + +@pytest.mark.parametrize( + "module, recipe, name", + [ + ("llama3_8b", "pretrain_recipe", 
"llama3_8b_pretrain"), + ("llama3_8b", "finetune_recipe", "llama3_8b_finetune"), + ("llama3_8b_16k", "pretrain_recipe", "llama3_8b_16k_pretrain"), + ("llama3_8b_16k", "finetune_recipe", "llama3_8b_16k_finetune"), + ("llama3_8b_64k", "pretrain_recipe", "llama3_8b_64k_pretrain"), + ("llama3_8b_64k", "finetune_recipe", "llama3_8b_64k_finetune"), + ("llama3_70b", "pretrain_recipe", "llama3_70b_pretrain"), + ("llama3_70b", "finetune_recipe", "llama3_70b_finetune"), + ("llama3_70b_16k", "pretrain_recipe", "llama3_70b_16k_pretrain"), + ("llama3_70b_16k", "finetune_recipe", "llama3_70b_16k_finetune"), + ("llama3_70b_64k", "pretrain_recipe", "llama3_70b_64k_pretrain"), + ("llama3_70b_64k", "finetune_recipe", "llama3_70b_64k_finetune"), + ("llama31_405b", "pretrain_recipe", "llama31_405b_pretrain"), + ("mistral", "pretrain_recipe", "mistral_pretrain"), + ("mistral", "finetune_recipe", "mistral_finetune"), + ("mixtral_8x3b", "pretrain_recipe", "mixtral_8x3b_pretrain"), + ("mixtral_8x3b", "finetune_recipe", "mixtral_8x3b_finetune"), + ("mixtral_8x3b_16k", "pretrain_recipe", "mixtral_8x3b_16k_pretrain"), + ("mixtral_8x3b_16k", "finetune_recipe", "mixtral_8x3b_16k_finetune"), + ("mixtral_8x3b_64k", "pretrain_recipe", "mixtral_8x3b_64k_pretrain"), + ("mixtral_8x3b_64k", "finetune_recipe", "mixtral_8x3b_64k_finetune"), + ("mixtral_8x7b", "pretrain_recipe", "mixtral_8x7b_pretrain"), + ("mixtral_8x7b", "finetune_recipe", "mixtral_8x7b_finetune"), + ("mixtral_8x7b_16k", "pretrain_recipe", "mixtral_8x7b_16k_pretrain"), + ("mixtral_8x7b_16k", "finetune_recipe", "mixtral_8x7b_16k_finetune"), + ("mixtral_8x7b_64k", "pretrain_recipe", "mixtral_8x7b_64k_pretrain"), + ("mixtral_8x7b_64k", "finetune_recipe", "mixtral_8x7b_64k_finetune"), + ("mixtral_8x22b", "pretrain_recipe", "mixtral_8x22b_pretrain"), + ("mixtral_8x22b", "finetune_recipe", "mixtral_8x22b_finetune"), + ("nemotron3_4b", "pretrain_recipe", "nemotron3_4b_pretrain"), + ("nemotron3_8b", "pretrain_recipe", "nemotron3_8b_pretrain"), + ("nemotron3_8b", "finetune_recipe", "nemotron3_8b_finetune"), + ("nemotron4_15b", "pretrain_recipe", "nemotron4_15b_pretrain"), + ("nemotron4_15b_16k", "pretrain_recipe", "nemotron4_15b_16k_pretrain"), + ("nemotron4_15b_64k", "pretrain_recipe", "nemotron4_15b_64k_pretrain"), + ("nemotron4_22b", "pretrain_recipe", "nemotron4_22b_pretrain"), + ("nemotron4_22b_16k", "pretrain_recipe", "nemotron4_22b_16k_pretrain"), + ("nemotron4_22b_64k", "pretrain_recipe", "nemotron4_22b_64k_pretrain"), + ("nemotron4_340b", "pretrain_recipe", "nemotron4_340b_pretrain"), + ("nemotron4_340b", "finetune_recipe", "nemotron4_340b_finetune"), + ], +) +def test_recipes_with_nemo_run(module, recipe, name, tmpdir, monkeypatch): + monkeypatch.setenv("NEMORUN_HOME", str(tmpdir)) + monkeypatch.setenv("WANDB_API_KEY", "dummy") + import nemo_run as run + + from nemo.collections import llm + from nemo.collections.llm.recipes.log.default import wandb_logger + from nemo.lightning.run import plugins + + recipe_config = getattr(getattr(llm, module), recipe)( + name=name, dir=BASE_CHECKPOINT_DIR, num_nodes=1, num_gpus_per_node=8 + ) + run_plugins = [ + plugins.PreemptionPlugin(), + plugins.WandbPlugin(name=name, logger_fn=partial(wandb_logger, entity="dummy", project="dummy")), + ] + validation_plugin = plugins.ConfigValidationPlugin(validate_wandb=True) + run_plugins.append(validation_plugin) + + with run.Experiment(f"{name}-unit-test") as exp: + exp.add( + recipe_config, + executor=run.SlurmExecutor( + account="dummy", + partition="dummy", + 
nodes=recipe_config.trainer.num_nodes, + ntasks_per_node=recipe_config.trainer.devices, + ), + name=name, + plugins=run_plugins, + ) + exp.dryrun() + + with pytest.raises(AssertionError): + with run.Experiment(f"{name}-unit-test-fail-validate-nodes-and-devices") as exp: + exp.add( + recipe_config, + executor=run.SlurmExecutor( + account="dummy", + partition="dummy", + nodes=recipe_config.trainer.num_nodes + 1, + ntasks_per_node=recipe_config.trainer.devices + 1, + ), + name=name, + plugins=run_plugins, + ) + exp.dryrun() + + with pytest.raises(AssertionError): + cfg = recipe_config.clone() + cfg.log.log_dir = "/temporary-does-not-exist" + with run.Experiment(f"{name}-unit-test-fail-validate-checkpoint-dir") as exp: + exp.add( + cfg, + executor=run.SlurmExecutor( + account="dummy", + partition="dummy", + nodes=cfg.trainer.num_nodes, + ntasks_per_node=cfg.trainer.devices, + ), + name=name, + plugins=run_plugins, + ) + exp.dryrun() + + run_plugins = [plugins.NsysPlugin(start_step=3, end_step=4)] + run_plugins + with run.Experiment(f"{name}-nsys-unit-test") as exp: + exp.add( + recipe_config, + executor=run.SlurmExecutor( + account="dummy", + partition="dummy", + nodes=recipe_config.trainer.num_nodes, + ntasks_per_node=recipe_config.trainer.devices, + ), + name=name, + plugins=run_plugins, + ) + exp.dryrun() From c02ea1296168da2ea0e1aab179d244ceb959c2f0 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Mon, 23 Sep 2024 17:49:35 -0400 Subject: [PATCH 03/14] Fix pps issue on nemo export (#10544) * fix minor import bug Signed-off-by: Onur Yilmaz * fix pps bug * remove config file Signed-off-by: Onur Yilmaz --------- Signed-off-by: Onur Yilmaz Co-authored-by: Shanmugam Ramasamy <111910568+shanmugamr1992@users.noreply.github.com> --- nemo/export/trt_llm/converter/model_converter.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/nemo/export/trt_llm/converter/model_converter.py b/nemo/export/trt_llm/converter/model_converter.py index 627096168d7b..366206c948eb 100755 --- a/nemo/export/trt_llm/converter/model_converter.py +++ b/nemo/export/trt_llm/converter/model_converter.py @@ -231,15 +231,12 @@ def model_to_trtllm_ckpt( "transformer.ln_f.bias", } - gpus_per_node = tensor_parallel_size if gpus_per_node is None else gpus_per_node - for i in range(world_size): mapping = tensorrt_llm.Mapping( world_size=world_size, rank=i, tp_size=tensor_parallel_size, pp_size=pipeline_parallel_size, - gpus_per_node=gpus_per_node, ) layers_range = mapping.pp_layers(num_layers) From 53a10a78d301975809293da02842ac40a923011a Mon Sep 17 00:00:00 2001 From: "He Huang (Steve)" <105218074+stevehuang52@users.noreply.github.com> Date: Tue, 24 Sep 2024 00:00:38 -0400 Subject: [PATCH 04/14] fix type error in llm collection (#10552) * fix type bugs Signed-off-by: stevehuang52 * Update mixin.py add type hint Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * Apply isort and black reformatting Signed-off-by: stevehuang52 * Update mixin.py Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> * Apply isort and black reformatting Signed-off-by: stevehuang52 * Update mixin.py Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> --------- Signed-off-by: stevehuang52 Signed-off-by: He Huang (Steve) <105218074+stevehuang52@users.noreply.github.com> Signed-off-by: stevehuang52 Co-authored-by: stevehuang52 --- nemo/lightning/io/connector.py | 2 +- nemo/lightning/io/mixin.py | 9 ++++++--- 2 files 
changed, 7 insertions(+), 4 deletions(-) diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py index 48222a4bd04d..7d81e631d6f1 100644 --- a/nemo/lightning/io/connector.py +++ b/nemo/lightning/io/connector.py @@ -222,7 +222,7 @@ def nemo_load( def local_path(self, base_path: Optional[Path] = None) -> Path: if base_path: - _base = base_path + _base = Path(base_path) else: from nemo.lightning.base import NEMO_MODELS_CACHE diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py index 36fb36bfcb34..c196d17e3343 100644 --- a/nemo/lightning/io/mixin.py +++ b/nemo/lightning/io/mixin.py @@ -306,13 +306,15 @@ def import_ckpt(self, path: str, overwrite: bool = False, base_path: Optional[Pa return ckpt_path @classmethod - def _get_connector(cls, ext, path=None, importer=True, **kwargs) -> ModelConnector: + def _get_connector( + cls, ext: Union[str, Path], path: Optional[Union[str, Path]] = None, importer: bool = True, **kwargs + ) -> ModelConnector: """ Retrieves the appropriate model connector based on the file extension and path, distinguishing between importers and exporters. Args: - ext (str): The file extension or a URI that may include a protocol specifier. + ext (Union[str, Path]): The file extension or a URI that may include a protocol specifier. path (Optional[Union[str, Path]]): The path where the model file is located or will be saved. importer (bool): Flag to determine if the connector is for importing (True) or exporting (False). @@ -326,10 +328,11 @@ def _get_connector(cls, ext, path=None, importer=True, **kwargs) -> ModelConnect when required. """ _path = None + ext = str(ext) if "://" in ext: ext, _path = ext.split("://") else: - _path = path + _path = str(path) connector = cls._IMPORTERS.get(str(cls) + ext) if importer else cls._EXPORTERS.get(str(cls) + ext) if not connector: From 6023b80650b414d2444f8088d580b2c0fcde4aaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 24 Sep 2024 07:31:37 +0200 Subject: [PATCH 05/14] ci: Safer sequence escaping (#10595) Signed-off-by: Oliver Koenig --- .../workflows/cherry-pick-release-commit.yml | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index 949f4e5e2125..022399ec4e5b 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -70,12 +70,21 @@ jobs: if [[ $CHERRYPICK_SUCCESSFUL -eq 0 ]]; then PR_URL="https://github.com/NVIDIA/NeMo/pull/$PR_ID" - PAYLOAD='{ - "title": "Cherry pick `'$PR_TITLE' ('$PR_ID')` into `'$RELEASE_BRANCH'`", - "head": "cherry-pick-'$PR_ID'-'$RELEASE_BRANCH'", - "base": "'$RELEASE_BRANCH'", - "body": "[πŸ€–]: Hi @'$AUTHOR' πŸ‘‹,
-
-we'"'"'ve cherry picked #'$PR_ID' into `'$RELEASE_BRANCH'` for you! πŸš€
-
Please review and approve this cherry pick by your convenience!" - }' + + PAYLOAD=$(jq \ + -n \ + -c \ + --arg TITLE "Cherry pick \`$PR_TITLE ($PR_ID)\` into \`$RELEASE_BRANCH\`" \ + --arg HEAD "cherry-pick-$PR_ID-$RELEASE_BRANCH" \ + --arg RELEASE_BRANCH "$RELEASE_BRANCH" \ + --arg BODY "[πŸ€–]: Hi @$AUTHOR πŸ‘‹,
+
+we've cherry-picked #$PR_ID into \`$RELEASE_BRANCH\` for you! πŸš€
+
Please review and approve this cherry pick by your convenience\!" \ + '{ + "title": $TITLE, + "head": $HEAD, + "base": $RELEASE_BRANCH, + "body": $BODY + }' + ) NEW_PR=$(curl -L \ -X POST \ @@ -85,7 +94,7 @@ jobs: https://api.github.com/repos/NVIDIA/NeMo/pulls \ -d $PAYLOAD) - NEW_PR_ID=$(echo -e $NEW_PR | jq '.number') + NEW_PR_ID=$(echo -E $NEW_PR | jq '.number') curl -L \ -X POST \ -H "Accept: application/vnd.github+json" \ From c4e415788f17ce22722e20bcc9cd2e0b274f0925 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 24 Sep 2024 11:16:30 +0200 Subject: [PATCH 06/14] ci: Fix issues with version bump (#10467) * ci: Fix issues with version bump Signed-off-by: Oliver Koenig * fix Signed-off-by: Oliver Koenig * f Signed-off-by: Oliver Koenig * f Signed-off-by: Oliver Koenig --------- Signed-off-by: Oliver Koenig --- .github/workflows/release-freeze.yml | 70 ++++++++++++++++------------ 1 file changed, 39 insertions(+), 31 deletions(-) diff --git a/.github/workflows/release-freeze.yml b/.github/workflows/release-freeze.yml index 7f8cd3dad8f5..2f4799cfc5e2 100644 --- a/.github/workflows/release-freeze.yml +++ b/.github/workflows/release-freeze.yml @@ -1,21 +1,20 @@ -name: "NeMo Code freeze" +name: "Code freeze" on: workflow_dispatch: inputs: - next_version: - description: 'MAJOR.MINOR.PATCH[rcN] (Example: 2.0.0rc1, or 2.1.0)' - required: true - type: string - is_prelease: - description: Whether to keep and bump the pre-release label - required: false - default: false - type: boolean + type_of_release: + type: choice + description: Type of release + options: + - major + - minor + - pre_release mcore_version: description: 'Version of MCore to use (must be a valid git ref)' required: true type: string + jobs: create-release-branch: runs-on: ubuntu-latest @@ -39,7 +38,7 @@ jobs: run: | cd ${{ github.run_id }} - if [[ "${{ inputs.is_prelease }}" == "false" ]]; then + if [[ "${{ inputs.type_of_release }}" != "pre_release" ]]; then sed -i "/^PRE_RELEASE/c\PRE_RELEASE = ''" nemo/package_info.py fi @@ -106,33 +105,42 @@ jobs: id: bump-version run: | cd ${{ github.run_id }} - FULL_VERSION_NUM=${{ inputs.next_version }} - VERSION=${FULL_VERSION_NUM%%rc*} - MAJOR=$(echo "$VERSION" | cut -d. -f1) - MINOR=$(echo "$VERSION" | cut -d. -f2) - PATCH=$(echo "$VERSION" | cut -d. 
-f3) - PRE_RELEASE=${FULL_VERSION_NUM#$VERSION} - - sed -i 's/^MAJOR\s*=\s*[0-9]\+/MAJOR = '$MAJOR'/' $VERSION_FILE - sed -i 's/^MINOR\s*=\s*[0-9]\+/MINOR = '$MINOR'/' $VERSION_FILE - sed -i 's/^PATCH\s*=\s*[0-9]\+/PATCH = '$PATCH'/' $VERSION_FILE - sed -i 's/^PRE_RELEASE\s*=\s*'.*'/PRE_RELEASE = '\'$PRE_RELEASE\''/' $VERSION_FILE - - cat $VERSION_FILE - PRE_RELEASE=$(echo $PRE_RELEASE | tr -d "'") - echo "version=$MAJOR.$MINOR.$PATCH$PRE_RELEASE" >> "$GITHUB_OUTPUT" + PRE_RELEASE=$(cat nemo/package_info.py | awk '/^PRE_RELEASE = /' | awk -F"= " '{print $2}' | tr -d '"' | tr -d "'") + MAJOR=$(cat nemo/package_info.py | awk '/^MAJOR = /' | awk -F"= " '{print $2}') + MINOR=$(cat nemo/package_info.py | awk '/^MINOR = /' | awk -F"= " '{print $2}') + PATCH=$(cat nemo/package_info.py | awk '/^PATCH = /' | awk -F"= " '{print $2}') + + if [[ "${{ inputs.type_of_release }}" == "pre_release" ]]; then + NEXT_MAJOR=$MAJOR + NEXT_MINOR=$MINOR + NEXT_PRE_RELEASE=rc$(( $(echo $PRE_RELEASE | awk -F"rc" '{print $2}') + 1)) + elif [[ "${{ inputs.type_of_release }}" == "major" ]]; then + NEXT_MAJOR=$(( MAJOR + 1)) + NEXT_MINOR=0 + NEXT_PRE_RELEASE=rc0 + else + NEXT_MAJOR=$MAJOR + NEXT_MINOR=$(( MINOR + 1)) + NEXT_PRE_RELEASE=rc0 + fi + + sed -i "/^MAJOR/c\MAJOR = $NEXT_MAJOR" nemo/package_info.py + sed -i "/^MINOR/c\MINOR = $NEXT_MINOR" nemo/package_info.py + sed -i "/^PRE_RELEASE/c\PRE_RELEASE = '$NEXT_PRE_RELEASE'" nemo/package_info.py + + echo "version=$NEXT_MAJOR.$NEXT_MINOR.$PATCH$NEXT_PRE_RELEASE" >> "$GITHUB_OUTPUT" - name: Create Version Bump PR uses: peter-evans/create-pull-request@v6 id: create-pull-request with: path: ${{ github.run_id }} - branch: bot/chore/version-bump-${{ inputs.next_version }} - title: 'Version bump to `${{ inputs.next_version }}`' + branch: bot/chore/version-bump-${{ steps.bump-version.outputs.version }} + title: 'Version bump to `${{ steps.bump-version.outputs.version }}`' body: | - πŸš€ Version bump NeMo toolkit to `${{ inputs.next_version }}` + πŸš€ Version bump NeMo-Toolkit to `${{ steps.bump-version.outputs.version }}` - commit-message: "[🀠]: Howdy folks, let's bump NeMo `${{ inputs.next_version }}` !" + commit-message: "[🀠]: Howdy folks, let's bump NeMo-Toolkit `${{ steps.bump-version.outputs.version }}` !" 
signoff: true assignees: okoenig labels: 'Run CICD' @@ -151,7 +159,7 @@ jobs: "type": "section", "text": { "type": "mrkdwn", - "text": "Releasebot πŸ€–: NeMo Toolkit has been frozen πŸŽ‰ to branch `r${{ needs.create-release-branch.outputs.version }}`" + "text": "Releasebot πŸ€–: NeMo-Toolkit has been frozen πŸŽ‰ to branch `r${{ needs.create-release-branch.outputs.version }}`" } } ] From 810d07f03034f481e3eee2332683c73d2fcecb5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Tue, 24 Sep 2024 11:36:36 +0200 Subject: [PATCH 07/14] ci: Add missing test specs (#10597) Signed-off-by: Oliver Koenig --- .github/workflows/cicd-main.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 730c363b41f3..74ae8d57b738 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -750,6 +750,7 @@ jobs: OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure-gpus-1 SCRIPT: |- @@ -2950,6 +2951,7 @@ jobs: L2_Megatron_GPT_Skip_Train: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Skip_Train') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | @@ -3222,6 +3224,7 @@ jobs: L2_Megatron_GPT_with_Drop_Optimizer_States_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_with_Drop_Optimizer_States_TP2') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | From 0fad1c15f949deef82b0f031021e30002975107b Mon Sep 17 00:00:00 2001 From: Jan Lasek Date: Tue, 24 Sep 2024 16:40:00 +0200 Subject: [PATCH 08/14] Extending modelopt spec for TEDotProductAttention (#10523) * Extend modelopt spec for TEDotProductAttention to support sliding window attention Signed-off-by: Jan Lasek * Simplify import guarding Signed-off-by: Jan Lasek --------- Signed-off-by: Jan Lasek --- .../megatron/gpt_layer_modelopt_spec.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py index e05c61bf3d24..080984133f3c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py @@ -13,11 +13,10 @@ # limitations under the License. 
try: - from megatron.core.extensions.transformer_engine import TENorm + from megatron.core.extensions.transformer_engine import TEDotProductAttention, TENorm from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules - from megatron.core.transformer.dot_product_attention import DotProductAttention from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.mlp import MLP, MLPSubmodules @@ -31,25 +30,23 @@ from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults - TransformerLayer = TransformerLayerSubmodules = ApexGuardDefaults - MLP = MLPSubmodules = ModuleSpec = IdentityOp = ApexGuardDefaults - AttnMaskType = DotProductAttention = TENorm = ApexGuardDefaults - ColumnParallelLinear = RowParallelLinear = SelfAttention = SelfAttentionSubmodules = ApexGuardDefaults - + ModuleSpec = ApexGuardDefaults HAVE_MEGATRON_CORE = False IMPORT_ERROR = e # Use this spec for Model Optimizer PTQ and TensorRT-LLM export def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec: - """Mix the native spec with TENorm. + """Mix the native spec with TENorm and TEDotProductAttention. This is essentially the native local spec except for the layernorm implementation is using TENorm from Transformer-Engine. This TENorm supports both FusedLayerNorm and RMSNorm and prevents the apex dependency. + + TEDotProductAttention is used to support sliding window attention. """ if not HAVE_MEGATRON_CORE: - raise Exception(IMPORT_ERROR) + raise IMPORT_ERROR return ModuleSpec( module=TransformerLayer, @@ -60,7 +57,7 @@ def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec: params={"attn_mask_type": AttnMaskType.causal}, submodules=SelfAttentionSubmodules( linear_qkv=ColumnParallelLinear, - core_attention=DotProductAttention, + core_attention=TEDotProductAttention, linear_proj=RowParallelLinear, q_layernorm=IdentityOp, k_layernorm=IdentityOp, From 849e7e02356d45fc589a162aedb437dbae854095 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20=C5=BBelasko?= Date: Tue, 24 Sep 2024 11:25:39 -0400 Subject: [PATCH 09/14] Update Multi_Task_Adapters.ipynb (#10600) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Piotr Ε»elasko --- tutorials/asr/asr_adapters/Multi_Task_Adapters.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tutorials/asr/asr_adapters/Multi_Task_Adapters.ipynb b/tutorials/asr/asr_adapters/Multi_Task_Adapters.ipynb index 7bd36e6b6ad8..852b3e838d5c 100644 --- a/tutorials/asr/asr_adapters/Multi_Task_Adapters.ipynb +++ b/tutorials/asr/asr_adapters/Multi_Task_Adapters.ipynb @@ -386,7 +386,7 @@ }, "outputs": [], "source": [ - "from nemo.collections.asr.data.audio_to_text_lhotse_prompted import get_prompt_format_fn, registered_prompt_format_fn" + "from nemo.collections.common.prompts.fn import get_prompt_format_fn, registered_prompt_format_fn" ] }, { @@ -707,7 +707,7 @@ "from lhotse.dataset.collation import collate_vectors\n", "\n", "from nemo.collections.asr.data.audio_to_text_lhotse import TokenizerWrapper\n", - "from nemo.collections.asr.data.audio_to_text_lhotse_prompted import PromptedAudioToTextLhotseDataset, get_prompt_format_fn\n", + "from nemo.collections.asr.data.audio_to_text_lhotse_prompted import 
PromptedAudioToTextLhotseDataset\n", "\n", "class MyCanaryPromptedAudioToTextLhotseDataset(torch.utils.data.Dataset):\n", " \"\"\"\n", From f351f64533c2af9c7e56f7fc687f89f173183978 Mon Sep 17 00:00:00 2001 From: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Date: Tue, 24 Sep 2024 10:47:28 -0700 Subject: [PATCH 10/14] Change default for always_save_context to True (#10547) Signed-off-by: Abhishree --- nemo/lightning/pytorch/callbacks/model_checkpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/lightning/pytorch/callbacks/model_checkpoint.py b/nemo/lightning/pytorch/callbacks/model_checkpoint.py index 2df4ca56d1a0..6aee365a3f60 100644 --- a/nemo/lightning/pytorch/callbacks/model_checkpoint.py +++ b/nemo/lightning/pytorch/callbacks/model_checkpoint.py @@ -73,7 +73,7 @@ def __init__( train_time_interval: Optional[timedelta] = None, save_on_train_epoch_end: Optional[bool] = False, # Save after training, not after validation save_optim_on_train_end: Optional[bool] = False, - always_save_context: bool = False, + always_save_context: bool = True, save_context_on_train_end: bool = True, **kwargs, ): From 70bc06bce63258d2d1cd6fc9e8bf2f37ebbd65fb Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Tue, 24 Sep 2024 11:27:57 -0700 Subject: [PATCH 11/14] Import guard for SimpleMultiModalDataModule (#10592) Signed-off-by: Alexandros Koumparoulis --- nemo/collections/multimodal/data/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nemo/collections/multimodal/data/__init__.py b/nemo/collections/multimodal/data/__init__.py index 01b98aecaecd..7e6ac24828f5 100644 --- a/nemo/collections/multimodal/data/__init__.py +++ b/nemo/collections/multimodal/data/__init__.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from nemo.utils.import_utils import safe_import_from -from nemo.collections.multimodal.data.energon import SimpleMultiModalDataModule - +SimpleMultiModalDataModule, _ = safe_import_from( + "nemo.collections.multimodal.data.energon", "SimpleMultiModalDataModule" +) __all__ = ["SimpleMultiModalDataModule"] From 9d5a1aab5ae872cb1fb02ed8f211c43336cf90d4 Mon Sep 17 00:00:00 2001 From: Nithin Rao Date: Tue, 24 Sep 2024 14:29:12 -0400 Subject: [PATCH 12/14] add support for train_time_interval to consider hydra object (#10559) Signed-off-by: Nithin Rao Koluguri Co-authored-by: Nithin Rao Koluguri --- nemo/utils/exp_manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index ca60010c6fda..543c7e0781d2 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -116,7 +116,7 @@ class CallbackParams: auto_insert_metric_name: bool = True every_n_epochs: Optional[int] = 1 every_n_train_steps: Optional[int] = None - train_time_interval: Optional[str] = None + train_time_interval: Optional[Any] = None prefix: Optional[str] = None # If None, exp_manager will attempt to handle the filepath postfix: str = ".nemo" save_best_model: bool = False @@ -374,6 +374,8 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo - max_time (str): The maximum wall clock time *per run*. This is intended to be used on clusters where you want a checkpoint to be saved after this specified time and be able to resume from that checkpoint. Defaults to None. 
- seconds_to_sleep (float): seconds to sleep non rank 0 processes for. Used to give enough time for rank 0 to initialize + - train_time_interval (timedelta): pass an object of timedelta to save the model every timedelta. Defaults to None. + (use _target_ with hydra to achieve this) returns: log_dir (Path): The final logging directory where logging files are saved. Usually the concatenation of From 877144a9eb299098792f57f58fec64795d31c67d Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Tue, 24 Sep 2024 13:28:23 -0700 Subject: [PATCH 13/14] Move update_config_with_dtype_overrides logging to debug (#10602) * Move update_config_with_dtype_overrides logging to debug Signed-off-by: Alexandros Koumparoulis * update comment Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis --- nemo/lightning/pytorch/plugins/mixed_precision.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/lightning/pytorch/plugins/mixed_precision.py b/nemo/lightning/pytorch/plugins/mixed_precision.py index c48bbcf8c1b1..5c318b59e54a 100644 --- a/nemo/lightning/pytorch/plugins/mixed_precision.py +++ b/nemo/lightning/pytorch/plugins/mixed_precision.py @@ -220,12 +220,12 @@ def update_config_with_dtype_overrides(dtype_config, config): for field in fields(dtype_config): if not hasattr(config, field.name): continue - # If we overwrote a value, throw a warning. + # If we overwrote a value, log a debug message. old_val = getattr(config, field.name) new_val = getattr(dtype_config, field.name) if old_val != new_val: setattr(config, field.name, new_val) - logging.warning(f"Overwrote {type(config).__name__}.{field.name} {old_val} -> {new_val}") + logging.debug(f"Overwrote {type(config).__name__}.{field.name} {old_val} -> {new_val}") return config From 0ec10d2dc97a7107f4d1cc25fc453bfe1c155878 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 25 Sep 2024 13:38:51 +0200 Subject: [PATCH 14/14] ci: Wrap into quotes (#10616) Signed-off-by: Oliver Koenig --- .github/workflows/cherry-pick-release-commit.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml index 022399ec4e5b..565592d63ef4 100644 --- a/.github/workflows/cherry-pick-release-commit.yml +++ b/.github/workflows/cherry-pick-release-commit.yml @@ -92,7 +92,7 @@ jobs: -H "Authorization: Bearer $GH_TOKEN" \ -H "X-GitHub-Api-Version: 2022-11-28" \ https://api.github.com/repos/NVIDIA/NeMo/pulls \ - -d $PAYLOAD) + -d "$PAYLOAD") NEW_PR_ID=$(echo -E $NEW_PR | jq '.number') curl -L \