Merge branch 'main' into nemo_tn_betterguard

NVIDIA · Sep 25, 2024 · 1cfe93d · 1cfe93d
2 parents a1b3b2a + 0ec10d2
commit 1cfe93d
Show file tree

Hide file tree

Showing 14 changed files with 284 additions and 67 deletions.
diff --git a/.github/workflows/cherry-pick-release-commit.yml b/.github/workflows/cherry-pick-release-commit.yml
@@ -31,6 +31,7 @@ jobs:
           SHA=$(git rev-list --no-merges -n 1 HEAD)
           MESSAGE=$(git log -n 1 --pretty=format:%s $SHA)
           PR_ID=$(echo $MESSAGE | awk -F'#' '{print $2}' | awk -F')' '{print $1}' )
+          USERNAME=$(git log -n 1 --pretty=format:%ae $SHA | awk -F'@' '{print $1}')
 
           PR=$(curl -L \
             -H "Accept: application/vnd.github+json" \
@@ -69,22 +70,31 @@ jobs:
 
             if [[ $CHERRYPICK_SUCCESSFUL -eq 0 ]]; then
               PR_URL="https://github.com/NVIDIA/NeMo/pull/$PR_ID"
-              PAYLOAD='{
-                "title": "Cherry pick `'$PR_TITLE' ('$PR_ID')` into `'$RELEASE_BRANCH'`",
-                "head": "cherry-pick-'$PR_ID'-'$RELEASE_BRANCH'",
-                "base": "'$RELEASE_BRANCH'",
-                "body": "[🤖]: Hi @'$AUTHOR' 👋,<br><br>we'"'"'ve cherry picked #'$PR_ID' into `'$RELEASE_BRANCH'` for you! 🚀<br><br>Please review and approve this cherry pick by your convenience!"
-              }'
+
+              PAYLOAD=$(jq \
+                -n \
+                -c \
+                --arg TITLE "Cherry pick \`$PR_TITLE ($PR_ID)\` into \`$RELEASE_BRANCH\`" \
+                --arg HEAD "cherry-pick-$PR_ID-$RELEASE_BRANCH" \
+                --arg RELEASE_BRANCH "$RELEASE_BRANCH" \
+                --arg BODY "[🤖]: Hi @$AUTHOR 👋,<br><br>we've cherry picked #$PR_ID into \`$RELEASE_BRANCH\` for you! 🚀<br><br>Please review and approve this cherry pick by your convenience\!" \
+                '{
+                  "title": $TITLE,
+                  "head": $HEAD,
+                  "base": $RELEASE_BRANCH,
+                  "body": $BODY
+                }'
+              )
 
               NEW_PR=$(curl -L \
                 -X POST \
                 -H "Accept: application/vnd.github+json" \
                 -H "Authorization: Bearer $GH_TOKEN" \
                 -H "X-GitHub-Api-Version: 2022-11-28" \
                 https://api.github.com/repos/NVIDIA/NeMo/pulls \
-                -d $PAYLOAD)
+                -d "$PAYLOAD")
 
-              NEW_PR_ID=$(echo -e $NEW_PR | jq '.number')
+              NEW_PR_ID=$(echo -E $NEW_PR | jq '.number')
               curl -L \
                 -X POST \
                 -H "Accept: application/vnd.github+json" \
@@ -110,7 +120,7 @@ jobs:
                     "type": "section",
                     "text": {
                       "type": "mrkdwn",
-                      "text": ":alert: Cherrypick bot 🤖: Cherry-pick of <'$URL'|#'$PR_ID'> failed"
+                      "text": ":alert: Cherrypick bot 🤖: Hey @'$USERNAME': Cherry-pick of <'$URL'|#'$PR_ID'> failed (3-way merge impossible). Please resolve manually and create a PR.\n\ncc: @okoenig"
                     }
                   }
                 ]

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
@@ -750,6 +750,7 @@ jobs:
   OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning:
     needs: [cicd-test-container-setup]
     uses: ./.github/workflows/_test_template.yml
+    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning') || needs.cicd-test-container-setup.outputs.all == 'true'
     with:
       RUNNER: self-hosted-azure-gpus-1
       SCRIPT: |-
@@ -2950,6 +2951,7 @@ jobs:
   L2_Megatron_GPT_Skip_Train:
     needs: [cicd-test-container-setup]
     uses: ./.github/workflows/_test_template.yml
+    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Skip_Train') || needs.cicd-test-container-setup.outputs.all == 'true'
     with:
       RUNNER: self-hosted-azure
       SCRIPT: |
@@ -3222,6 +3224,7 @@ jobs:
   L2_Megatron_GPT_with_Drop_Optimizer_States_TP2:
     needs: [cicd-test-container-setup]
     uses: ./.github/workflows/_test_template.yml
+    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_with_Drop_Optimizer_States_TP2') || needs.cicd-test-container-setup.outputs.all == 'true'
     with:
       RUNNER: self-hosted-azure
       SCRIPT: |

diff --git a/.github/workflows/release-freeze.yml b/.github/workflows/release-freeze.yml
@@ -1,21 +1,20 @@
-name: "NeMo Code freeze"
+name: "Code freeze"
 
 on:
   workflow_dispatch:
     inputs:
-      next_version:
-        description: 'MAJOR.MINOR.PATCH[rcN] (Example: 2.0.0rc1, or 2.1.0)'
-        required: true
-        type: string
-      is_prelease:
-        description: Whether to keep and bump the pre-release label
-        required: false
-        default: false
-        type: boolean
+      type_of_release:
+        type: choice
+        description: Type of release
+        options: 
+        - major
+        - minor
+        - pre_release
       mcore_version:
         description: 'Version of MCore to use (must be a valid git ref)'
         required: true
         type: string
+
 jobs:
   create-release-branch:
     runs-on: ubuntu-latest
@@ -39,7 +38,7 @@ jobs:
         run: |
           cd ${{ github.run_id }}
                     
-          if [[ "${{ inputs.is_prelease }}" == "false" ]]; then
+          if [[ "${{ inputs.type_of_release }}" != "pre_release" ]]; then
             sed -i "/^PRE_RELEASE/c\PRE_RELEASE = ''" nemo/package_info.py 
           fi
 
@@ -106,33 +105,42 @@ jobs:
         id: bump-version
         run: |
           cd ${{ github.run_id }}
-          FULL_VERSION_NUM=${{ inputs.next_version }}
-          VERSION=${FULL_VERSION_NUM%%rc*}
-          MAJOR=$(echo "$VERSION" | cut -d. -f1)
-          MINOR=$(echo "$VERSION" | cut -d. -f2)
-          PATCH=$(echo "$VERSION" | cut -d. -f3)
-          PRE_RELEASE=${FULL_VERSION_NUM#$VERSION}
-          
-          sed -i 's/^MAJOR\s*=\s*[0-9]\+/MAJOR = '$MAJOR'/' $VERSION_FILE
-          sed -i 's/^MINOR\s*=\s*[0-9]\+/MINOR = '$MINOR'/' $VERSION_FILE
-          sed -i 's/^PATCH\s*=\s*[0-9]\+/PATCH = '$PATCH'/' $VERSION_FILE
-          sed -i 's/^PRE_RELEASE\s*=\s*'.*'/PRE_RELEASE = '\'$PRE_RELEASE\''/' $VERSION_FILE
-
-          cat $VERSION_FILE
-          PRE_RELEASE=$(echo $PRE_RELEASE | tr -d "'")
-          echo "version=$MAJOR.$MINOR.$PATCH$PRE_RELEASE" >> "$GITHUB_OUTPUT"
+          PRE_RELEASE=$(cat nemo/package_info.py | awk '/^PRE_RELEASE = /' | awk -F"= " '{print $2}' | tr -d '"' | tr -d "'")
+          MAJOR=$(cat nemo/package_info.py | awk '/^MAJOR = /' | awk -F"= " '{print $2}')
+          MINOR=$(cat nemo/package_info.py | awk '/^MINOR = /' | awk -F"= " '{print $2}')
+          PATCH=$(cat nemo/package_info.py | awk '/^PATCH = /' | awk -F"= " '{print $2}')
+
+          if [[ "${{ inputs.type_of_release }}" == "pre_release" ]]; then
+            NEXT_MAJOR=$MAJOR
+            NEXT_MINOR=$MINOR
+            NEXT_PRE_RELEASE=rc$(( $(echo $PRE_RELEASE | awk -F"rc" '{print $2}') + 1))
+          elif [[ "${{ inputs.type_of_release }}" == "major" ]]; then
+            NEXT_MAJOR=$(( MAJOR + 1))
+            NEXT_MINOR=0
+            NEXT_PRE_RELEASE=rc0
+          else
+            NEXT_MAJOR=$MAJOR
+            NEXT_MINOR=$(( MINOR + 1))
+            NEXT_PRE_RELEASE=rc0
+          fi
+
+          sed -i "/^MAJOR/c\MAJOR = $NEXT_MAJOR" nemo/package_info.py
+          sed -i "/^MINOR/c\MINOR = $NEXT_MINOR" nemo/package_info.py
+          sed -i "/^PRE_RELEASE/c\PRE_RELEASE = '$NEXT_PRE_RELEASE'" nemo/package_info.py
+
+          echo "version=$NEXT_MAJOR.$NEXT_MINOR.$PATCH$NEXT_PRE_RELEASE" >> "$GITHUB_OUTPUT"
 
       - name: Create Version Bump PR
         uses: peter-evans/create-pull-request@v6
         id: create-pull-request
         with:
           path: ${{ github.run_id }}
-          branch: bot/chore/version-bump-${{ inputs.next_version }}
-          title: 'Version bump to `${{ inputs.next_version }}`'
+          branch: bot/chore/version-bump-${{ steps.bump-version.outputs.version }}
+          title: 'Version bump to `${{ steps.bump-version.outputs.version }}`'
           body: |
-            🚀 Version bump NeMo toolkit to `${{ inputs.next_version }}`
+            🚀 Version bump NeMo-Toolkit to `${{ steps.bump-version.outputs.version }}`
 
-          commit-message: "[🤠]: Howdy folks, let's bump NeMo `${{ inputs.next_version }}` !"
+          commit-message: "[🤠]: Howdy folks, let's bump NeMo-Toolkit `${{ steps.bump-version.outputs.version }}` !"
           signoff: true
           assignees: okoenig
           labels: 'Run CICD'
@@ -151,7 +159,7 @@ jobs:
                 "type": "section",
                 "text": {
                   "type": "mrkdwn",
-                  "text": "Releasebot 🤖: NeMo Toolkit has been frozen 🎉 to branch `r${{ needs.create-release-branch.outputs.version }}`"
+                  "text": "Releasebot 🤖: NeMo-Toolkit has been frozen 🎉 to branch `r${{ needs.create-release-branch.outputs.version }}`"
                 }
               }
             ]

diff --git a/nemo/collections/multimodal/data/__init__.py b/nemo/collections/multimodal/data/__init__.py
@@ -12,7 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from nemo.utils.import_utils import safe_import_from
 
-from nemo.collections.multimodal.data.energon import SimpleMultiModalDataModule
-
+SimpleMultiModalDataModule, _ = safe_import_from(
+    "nemo.collections.multimodal.data.energon", "SimpleMultiModalDataModule"
+)
 __all__ = ["SimpleMultiModalDataModule"]
diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py
@@ -13,11 +13,10 @@
 # limitations under the License.
 
 try:
-    from megatron.core.extensions.transformer_engine import TENorm
+    from megatron.core.extensions.transformer_engine import TEDotProductAttention, TENorm
     from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
     from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
     from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
-    from megatron.core.transformer.dot_product_attention import DotProductAttention
     from megatron.core.transformer.enums import AttnMaskType
     from megatron.core.transformer.identity_op import IdentityOp
     from megatron.core.transformer.mlp import MLP, MLPSubmodules
@@ -31,25 +30,23 @@
 
     from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults
 
-    TransformerLayer = TransformerLayerSubmodules = ApexGuardDefaults
-    MLP = MLPSubmodules = ModuleSpec = IdentityOp = ApexGuardDefaults
-    AttnMaskType = DotProductAttention = TENorm = ApexGuardDefaults
-    ColumnParallelLinear = RowParallelLinear = SelfAttention = SelfAttentionSubmodules = ApexGuardDefaults
-
+    ModuleSpec = ApexGuardDefaults
     HAVE_MEGATRON_CORE = False
     IMPORT_ERROR = e
 
 
 # Use this spec for Model Optimizer PTQ and TensorRT-LLM export
 def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec:
-    """Mix the native spec with TENorm.
+    """Mix the native spec with TENorm and TEDotProductAttention.
 
     This is essentially the native local spec, except that the layernorm implementation
     uses TENorm from Transformer-Engine. TENorm supports both FusedLayerNorm and RMSNorm and
     avoids the apex dependency.
+
+    TEDotProductAttention is used to support sliding window attention.
     """
     if not HAVE_MEGATRON_CORE:
-        raise Exception(IMPORT_ERROR)
+        raise IMPORT_ERROR
 
     return ModuleSpec(
         module=TransformerLayer,
@@ -60,7 +57,7 @@ def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec:
                 params={"attn_mask_type": AttnMaskType.causal},
                 submodules=SelfAttentionSubmodules(
                     linear_qkv=ColumnParallelLinear,
-                    core_attention=DotProductAttention,
+                    core_attention=TEDotProductAttention,
                     linear_proj=RowParallelLinear,
                     q_layernorm=IdentityOp,
                     k_layernorm=IdentityOp,

diff --git a/nemo/export/trt_llm/converter/model_converter.py b/nemo/export/trt_llm/converter/model_converter.py
@@ -231,15 +231,12 @@ def model_to_trtllm_ckpt(
         "transformer.ln_f.bias",
     }
 
-    gpus_per_node = tensor_parallel_size if gpus_per_node is None else gpus_per_node
-
     for i in range(world_size):
         mapping = tensorrt_llm.Mapping(
             world_size=world_size,
             rank=i,
             tp_size=tensor_parallel_size,
             pp_size=pipeline_parallel_size,
-            gpus_per_node=gpus_per_node,
         )
         layers_range = mapping.pp_layers(num_layers)
 

diff --git a/nemo/lightning/io/connector.py b/nemo/lightning/io/connector.py
@@ -222,7 +222,7 @@ def nemo_load(
 
     def local_path(self, base_path: Optional[Path] = None) -> Path:
         if base_path:
-            _base = base_path
+            _base = Path(base_path)
         else:
             from nemo.lightning.base import NEMO_MODELS_CACHE
 

diff --git a/nemo/lightning/io/mixin.py b/nemo/lightning/io/mixin.py
@@ -306,13 +306,15 @@ def import_ckpt(self, path: str, overwrite: bool = False, base_path: Optional[Pa
         return ckpt_path
 
     @classmethod
-    def _get_connector(cls, ext, path=None, importer=True, **kwargs) -> ModelConnector:
+    def _get_connector(
+        cls, ext: Union[str, Path], path: Optional[Union[str, Path]] = None, importer: bool = True, **kwargs
+    ) -> ModelConnector:
         """
         Retrieves the appropriate model connector based on the file extension and path,
         distinguishing between importers and exporters.
 
         Args:
-            ext (str): The file extension or a URI that may include a protocol specifier.
+            ext (Union[str, Path]): The file extension or a URI that may include a protocol specifier.
             path (Optional[Union[str, Path]]): The path where the model file is located or will be saved.
             importer (bool): Flag to determine if the connector is for importing (True) or exporting (False).
 
@@ -326,10 +328,11 @@ def _get_connector(cls, ext, path=None, importer=True, **kwargs) -> ModelConnect
                         when required.
         """
         _path = None
+        ext = str(ext)
         if "://" in ext:
             ext, _path = ext.split("://")
         else:
-            _path = path
+            _path = str(path)
 
         connector = cls._IMPORTERS.get(str(cls) + ext) if importer else cls._EXPORTERS.get(str(cls) + ext)
         if not connector:

diff --git a/nemo/lightning/pytorch/callbacks/model_checkpoint.py b/nemo/lightning/pytorch/callbacks/model_checkpoint.py
@@ -73,7 +73,7 @@ def __init__(
         train_time_interval: Optional[timedelta] = None,
         save_on_train_epoch_end: Optional[bool] = False,  # Save after training, not after validation
         save_optim_on_train_end: Optional[bool] = False,
-        always_save_context: bool = False,
+        always_save_context: bool = True,
         save_context_on_train_end: bool = True,
         **kwargs,
     ):

diff --git a/nemo/lightning/pytorch/plugins/mixed_precision.py b/nemo/lightning/pytorch/plugins/mixed_precision.py
@@ -220,12 +220,12 @@ def update_config_with_dtype_overrides(dtype_config, config):
     for field in fields(dtype_config):
         if not hasattr(config, field.name):
             continue
-        # If we overwrote a value, throw a warning.
+        # If we overwrote a value, log a debug message.
         old_val = getattr(config, field.name)
         new_val = getattr(dtype_config, field.name)
         if old_val != new_val:
             setattr(config, field.name, new_val)
-            logging.warning(f"Overwrote {type(config).__name__}.{field.name}  {old_val} -> {new_val}")
+            logging.debug(f"Overwrote {type(config).__name__}.{field.name}  {old_val} -> {new_val}")
     return config