Skip to content

Commit

Permalink
Merge branch 'main' into nemo_tn_betterguard
Browse files Browse the repository at this point in the history
  • Loading branch information
blisc committed Sep 25, 2024
2 parents a1b3b2a + 0ec10d2 commit 1cfe93d
Show file tree
Hide file tree
Showing 14 changed files with 284 additions and 67 deletions.
28 changes: 19 additions & 9 deletions .github/workflows/cherry-pick-release-commit.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ jobs:
SHA=$(git rev-list --no-merges -n 1 HEAD)
MESSAGE=$(git log -n 1 --pretty=format:%s $SHA)
PR_ID=$(echo $MESSAGE | awk -F'#' '{print $2}' | awk -F')' '{print $1}' )
USERNAME=$(git log -n 1 --pretty=format:%ae $SHA | awk -F'@' '{print $1}')
PR=$(curl -L \
-H "Accept: application/vnd.github+json" \
Expand Down Expand Up @@ -69,22 +70,31 @@ jobs:

if [[ $CHERRYPICK_SUCCESSFUL -eq 0 ]]; then
PR_URL="https://github.com/NVIDIA/NeMo/pull/$PR_ID"
PAYLOAD='{
"title": "Cherry pick `'$PR_TITLE' ('$PR_ID')` into `'$RELEASE_BRANCH'`",
"head": "cherry-pick-'$PR_ID'-'$RELEASE_BRANCH'",
"base": "'$RELEASE_BRANCH'",
"body": "[🤖]: Hi @'$AUTHOR' 👋,<br><br>we'"'"'ve cherry picked #'$PR_ID' into `'$RELEASE_BRANCH'` for you! 🚀<br><br>Please review and approve this cherry pick by your convenience!"
}'

# Build the JSON body for the "create pull request" API call.
# Every dynamic value is passed through `jq --arg`, which JSON-escapes it, so
# quotes or backslashes in the PR title / author / branch cannot break the payload.
# Note: `!` needs no escaping inside double quotes here; writing `\!` would keep
# the backslash and put a literal `\!` into the PR body.
PAYLOAD=$(jq \
-n \
-c \
--arg TITLE "Cherry pick \`$PR_TITLE ($PR_ID)\` into \`$RELEASE_BRANCH\`" \
--arg HEAD "cherry-pick-$PR_ID-$RELEASE_BRANCH" \
--arg RELEASE_BRANCH "$RELEASE_BRANCH" \
--arg BODY "[🤖]: Hi @$AUTHOR 👋,<br><br>we've cherry picked #$PR_ID into \`$RELEASE_BRANCH\` for you! 🚀<br><br>Please review and approve this cherry pick by your convenience!" \
'{
"title": $TITLE,
"head": $HEAD,
"base": $RELEASE_BRANCH,
"body": $BODY
}'
)

NEW_PR=$(curl -L \
-X POST \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer $GH_TOKEN" \
-H "X-GitHub-Api-Version: 2022-11-28" \
https://api.github.com/repos/NVIDIA/NeMo/pulls \
-d $PAYLOAD)
-d "$PAYLOAD")

NEW_PR_ID=$(echo -e $NEW_PR | jq '.number')
NEW_PR_ID=$(echo -E $NEW_PR | jq '.number')
curl -L \
-X POST \
-H "Accept: application/vnd.github+json" \
Expand All @@ -110,7 +120,7 @@ jobs:
"type": "section",
"text": {
"type": "mrkdwn",
"text": ":alert: Cherrypick bot 🤖: Cherry-pick of <'$URL'|#'$PR_ID'> failed"
"text": ":alert: Cherrypick bot 🤖: Hey @'$USERNAME': Cherry-pick of <'$URL'|#'$PR_ID'> failed (3-way merge impossible). Please resolve manually and create a PR.\n\ncc: @okoenig"
}
}
]
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -750,6 +750,7 @@ jobs:
OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_ASR_dev_run_Speech_To_Text_HF_Finetuning') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure-gpus-1
SCRIPT: |-
Expand Down Expand Up @@ -2950,6 +2951,7 @@ jobs:
L2_Megatron_GPT_Skip_Train:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_Skip_Train') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
Expand Down Expand Up @@ -3222,6 +3224,7 @@ jobs:
L2_Megatron_GPT_with_Drop_Optimizer_States_TP2:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_GPT_with_Drop_Optimizer_States_TP2') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
Expand Down
70 changes: 39 additions & 31 deletions .github/workflows/release-freeze.yml
Original file line number Diff line number Diff line change
@@ -1,21 +1,20 @@
name: "NeMo Code freeze"
name: "Code freeze"

on:
workflow_dispatch:
inputs:
next_version:
description: 'MAJOR.MINOR.PATCH[rcN] (Example: 2.0.0rc1, or 2.1.0)'
required: true
type: string
is_prelease:
description: Whether to keep and bump the pre-release label
required: false
default: false
type: boolean
type_of_release:
type: choice
description: Type of release
options:
- major
- minor
- pre_release
mcore_version:
description: 'Version of MCore to use (must be a valid git ref)'
required: true
type: string

jobs:
create-release-branch:
runs-on: ubuntu-latest
Expand All @@ -39,7 +38,7 @@ jobs:
run: |
cd ${{ github.run_id }}
if [[ "${{ inputs.is_prelease }}" == "false" ]]; then
if [[ "${{ inputs.type_of_release }}" != "pre_release" ]]; then
sed -i "/^PRE_RELEASE/c\PRE_RELEASE = ''" nemo/package_info.py
fi
Expand Down Expand Up @@ -106,33 +105,42 @@ jobs:
id: bump-version
run: |
cd ${{ github.run_id }}
FULL_VERSION_NUM=${{ inputs.next_version }}
VERSION=${FULL_VERSION_NUM%%rc*}
MAJOR=$(echo "$VERSION" | cut -d. -f1)
MINOR=$(echo "$VERSION" | cut -d. -f2)
PATCH=$(echo "$VERSION" | cut -d. -f3)
PRE_RELEASE=${FULL_VERSION_NUM#$VERSION}
sed -i 's/^MAJOR\s*=\s*[0-9]\+/MAJOR = '$MAJOR'/' $VERSION_FILE
sed -i 's/^MINOR\s*=\s*[0-9]\+/MINOR = '$MINOR'/' $VERSION_FILE
sed -i 's/^PATCH\s*=\s*[0-9]\+/PATCH = '$PATCH'/' $VERSION_FILE
sed -i 's/^PRE_RELEASE\s*=\s*'.*'/PRE_RELEASE = '\'$PRE_RELEASE\''/' $VERSION_FILE
cat $VERSION_FILE
PRE_RELEASE=$(echo $PRE_RELEASE | tr -d "'")
echo "version=$MAJOR.$MINOR.$PATCH$PRE_RELEASE" >> "$GITHUB_OUTPUT"
# Read the current version components from nemo/package_info.py. Each value is
# taken from the line of the form `NAME = value`; for PRE_RELEASE the surrounding
# single/double quotes are stripped so only the bare label remains.
PRE_RELEASE=$(cat nemo/package_info.py | awk '/^PRE_RELEASE = /' | awk -F"= " '{print $2}' | tr -d '"' | tr -d "'")
MAJOR=$(cat nemo/package_info.py | awk '/^MAJOR = /' | awk -F"= " '{print $2}')
MINOR=$(cat nemo/package_info.py | awk '/^MINOR = /' | awk -F"= " '{print $2}')
PATCH=$(cat nemo/package_info.py | awk '/^PATCH = /' | awk -F"= " '{print $2}')
# Derive the next version from the requested release type.
if [[ "${{ inputs.type_of_release }}" == "pre_release" ]]; then
# pre_release: keep MAJOR/MINOR and increment the number that follows "rc".
NEXT_MAJOR=$MAJOR
NEXT_MINOR=$MINOR
NEXT_PRE_RELEASE=rc$(( $(echo $PRE_RELEASE | awk -F"rc" '{print $2}') + 1))
elif [[ "${{ inputs.type_of_release }}" == "major" ]]; then
# major: bump MAJOR, reset MINOR to 0 and restart the pre-release label at rc0.
NEXT_MAJOR=$(( MAJOR + 1))
NEXT_MINOR=0
NEXT_PRE_RELEASE=rc0
else
# Any other release type: keep MAJOR, bump MINOR and restart at rc0.
NEXT_MAJOR=$MAJOR
NEXT_MINOR=$(( MINOR + 1))
NEXT_PRE_RELEASE=rc0
fi
# Rewrite the matching lines of nemo/package_info.py in place; sed's `c\` command
# replaces the whole line that starts with the given name.
# NOTE(review): PATCH is read above but never reset or written back here, so the
# emitted version carries over the existing PATCH value — confirm this is intended.
sed -i "/^MAJOR/c\MAJOR = $NEXT_MAJOR" nemo/package_info.py
sed -i "/^MINOR/c\MINOR = $NEXT_MINOR" nemo/package_info.py
sed -i "/^PRE_RELEASE/c\PRE_RELEASE = '$NEXT_PRE_RELEASE'" nemo/package_info.py
# Publish the new version string as this step's `version` output.
echo "version=$NEXT_MAJOR.$NEXT_MINOR.$PATCH$NEXT_PRE_RELEASE" >> "$GITHUB_OUTPUT"
- name: Create Version Bump PR
uses: peter-evans/create-pull-request@v6
id: create-pull-request
with:
path: ${{ github.run_id }}
branch: bot/chore/version-bump-${{ inputs.next_version }}
title: 'Version bump to `${{ inputs.next_version }}`'
branch: bot/chore/version-bump-${{ steps.bump-version.outputs.version }}
title: 'Version bump to `${{ steps.bump-version.outputs.version }}`'
body: |
🚀 Version bump NeMo toolkit to `${{ inputs.next_version }}`
🚀 Version bump NeMo-Toolkit to `${{ steps.bump-version.outputs.version }}`
commit-message: "[🤠]: Howdy folks, let's bump NeMo `${{ inputs.next_version }}` !"
commit-message: "[🤠]: Howdy folks, let's bump NeMo-Toolkit `${{ steps.bump-version.outputs.version }}` !"
signoff: true
assignees: okoenig
labels: 'Run CICD'
Expand All @@ -151,7 +159,7 @@ jobs:
"type": "section",
"text": {
"type": "mrkdwn",
"text": "Releasebot 🤖: NeMo Toolkit has been frozen 🎉 to branch `r${{ needs.create-release-branch.outputs.version }}`"
"text": "Releasebot 🤖: NeMo-Toolkit has been frozen 🎉 to branch `r${{ needs.create-release-branch.outputs.version }}`"
}
}
]
Expand Down
6 changes: 4 additions & 2 deletions nemo/collections/multimodal/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from nemo.utils.import_utils import safe_import_from

from nemo.collections.multimodal.data.energon import SimpleMultiModalDataModule

SimpleMultiModalDataModule, _ = safe_import_from(
"nemo.collections.multimodal.data.energon", "SimpleMultiModalDataModule"
)
__all__ = ["SimpleMultiModalDataModule"]
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,10 @@
# limitations under the License.

try:
from megatron.core.extensions.transformer_engine import TENorm
from megatron.core.extensions.transformer_engine import TEDotProductAttention, TENorm
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.mlp import MLP, MLPSubmodules
Expand All @@ -31,25 +30,23 @@

from nemo.collections.nlp.modules.common.megatron.utils import ApexGuardDefaults

TransformerLayer = TransformerLayerSubmodules = ApexGuardDefaults
MLP = MLPSubmodules = ModuleSpec = IdentityOp = ApexGuardDefaults
AttnMaskType = DotProductAttention = TENorm = ApexGuardDefaults
ColumnParallelLinear = RowParallelLinear = SelfAttention = SelfAttentionSubmodules = ApexGuardDefaults

ModuleSpec = ApexGuardDefaults
HAVE_MEGATRON_CORE = False
IMPORT_ERROR = e


# Use this spec for Model Optimizer PTQ and TensorRT-LLM export
def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec:
"""Mix the native spec with TENorm.
"""Mix the native spec with TENorm and TEDotProductAttention.
This is essentially the native local spec, except that the layernorm implementation
uses TENorm from Transformer-Engine. TENorm supports both FusedLayerNorm and RMSNorm and
avoids the apex dependency.
TEDotProductAttention is used to support sliding window attention.
"""
if not HAVE_MEGATRON_CORE:
raise Exception(IMPORT_ERROR)
raise IMPORT_ERROR

return ModuleSpec(
module=TransformerLayer,
Expand All @@ -60,7 +57,7 @@ def get_gpt_layer_modelopt_spec(num_experts: int = None) -> ModuleSpec:
params={"attn_mask_type": AttnMaskType.causal},
submodules=SelfAttentionSubmodules(
linear_qkv=ColumnParallelLinear,
core_attention=DotProductAttention,
core_attention=TEDotProductAttention,
linear_proj=RowParallelLinear,
q_layernorm=IdentityOp,
k_layernorm=IdentityOp,
Expand Down
3 changes: 0 additions & 3 deletions nemo/export/trt_llm/converter/model_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,15 +231,12 @@ def model_to_trtllm_ckpt(
"transformer.ln_f.bias",
}

gpus_per_node = tensor_parallel_size if gpus_per_node is None else gpus_per_node

for i in range(world_size):
mapping = tensorrt_llm.Mapping(
world_size=world_size,
rank=i,
tp_size=tensor_parallel_size,
pp_size=pipeline_parallel_size,
gpus_per_node=gpus_per_node,
)
layers_range = mapping.pp_layers(num_layers)

Expand Down
2 changes: 1 addition & 1 deletion nemo/lightning/io/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def nemo_load(

def local_path(self, base_path: Optional[Path] = None) -> Path:
if base_path:
_base = base_path
_base = Path(base_path)
else:
from nemo.lightning.base import NEMO_MODELS_CACHE

Expand Down
9 changes: 6 additions & 3 deletions nemo/lightning/io/mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,13 +306,15 @@ def import_ckpt(self, path: str, overwrite: bool = False, base_path: Optional[Pa
return ckpt_path

@classmethod
def _get_connector(cls, ext, path=None, importer=True, **kwargs) -> ModelConnector:
def _get_connector(
cls, ext: Union[str, Path], path: Optional[Union[str, Path]] = None, importer: bool = True, **kwargs
) -> ModelConnector:
"""
Retrieves the appropriate model connector based on the file extension and path,
distinguishing between importers and exporters.
Args:
ext (str): The file extension or a URI that may include a protocol specifier.
ext (Union[str, Path]): The file extension or a URI that may include a protocol specifier.
path (Optional[Union[str, Path]]): The path where the model file is located or will be saved.
importer (bool): Flag to determine if the connector is for importing (True) or exporting (False).
Expand All @@ -326,10 +328,11 @@ def _get_connector(cls, ext, path=None, importer=True, **kwargs) -> ModelConnect
when required.
"""
_path = None
ext = str(ext)
if "://" in ext:
ext, _path = ext.split("://")
else:
_path = path
_path = str(path)

connector = cls._IMPORTERS.get(str(cls) + ext) if importer else cls._EXPORTERS.get(str(cls) + ext)
if not connector:
Expand Down
2 changes: 1 addition & 1 deletion nemo/lightning/pytorch/callbacks/model_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def __init__(
train_time_interval: Optional[timedelta] = None,
save_on_train_epoch_end: Optional[bool] = False, # Save after training, not after validation
save_optim_on_train_end: Optional[bool] = False,
always_save_context: bool = False,
always_save_context: bool = True,
save_context_on_train_end: bool = True,
**kwargs,
):
Expand Down
4 changes: 2 additions & 2 deletions nemo/lightning/pytorch/plugins/mixed_precision.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,12 +220,12 @@ def update_config_with_dtype_overrides(dtype_config, config):
for field in fields(dtype_config):
if not hasattr(config, field.name):
continue
# If we overwrote a value, throw a warning.
# If we overwrote a value, log a debug message.
old_val = getattr(config, field.name)
new_val = getattr(dtype_config, field.name)
if old_val != new_val:
setattr(config, field.name, new_val)
logging.warning(f"Overwrote {type(config).__name__}.{field.name} {old_val} -> {new_val}")
logging.debug(f"Overwrote {type(config).__name__}.{field.name} {old_val} -> {new_val}")
return config


Expand Down
Loading

0 comments on commit 1cfe93d

Please sign in to comment.