Add mixtral recipe reg tests on a3ultra #511

Open · wants to merge 8 commits into base: master
176 changes: 176 additions & 0 deletions dags/map_reproducibility/a3mega_gpt3_175b_nemo.py
@@ -0,0 +1,176 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""DAGs to run Aotc reproducibility benchmarks."""

import datetime
import os
import tempfile

from airflow import models
from airflow.decorators import task
from airflow.hooks.subprocess import SubprocessHook
from dags import composer_env
from dags.map_reproducibility.utils.common_utils import get_metrics_cmds
from dags.map_reproducibility.utils.common_utils import configure_project_and_cluster
from dags.map_reproducibility.utils.common_utils import install_helm_cmds
from dags.map_reproducibility.utils.common_utils import namespace_cmds
from dags.map_reproducibility.utils.common_utils import wait_for_jobs_cmds
from dags.map_reproducibility.utils.common_utils import copy_bucket_cmds
from dags.map_reproducibility.utils.common_utils import cleanup_cmds
from dags.map_reproducibility.utils.common_utils import git_cookie_authdaemon
from dags.map_reproducibility.utils.common_utils import clone_recipes_gob
from dags.map_reproducibility.utils.common_utils import helm_apply_cmds
from dags.map_reproducibility.utils.common_utils import get_metrics
from dags.map_reproducibility.utils.common_utils import get_aotc_repo
from dags.map_reproducibility.utils.benchmarkdb_utils import write_run
from dags.map_reproducibility.utils.common_utils import extract_run_details
from dags.map_reproducibility.utils.common_utils import extract_gpus
from dags.map_reproducibility.utils.common_utils import get_accelerator_type
from dags.map_reproducibility.utils.common_utils import get_pre_workload_cmds
from dags.map_reproducibility.utils.common_utils import get_gpu_recipe_cmd


# Run once a day at 2 pm UTC (6 am PST)
SCHEDULED_TIME = "0 14 * * *" if composer_env.is_prod_env() else None

MODEL_ID = "gpt3-175b"
PRECISION = "fp8"
HYPERCOMPUTER = "a3mega"
FRAMEWORK = "nemo"
VALUE_YAML_PATH = (
    f"training/{HYPERCOMPUTER}/{MODEL_ID}/nemo-pretraining-gke/values.yaml"
)
CLUSTER = "a3plus-benchmark"
CLUSTER_REGION = "australia-southeast1"
SOFTWARE_ID = "pytorch_nemo"
IMAGE_VERSION = "nemo_workload:24.07"


@task
def run_aotc_workload():
  with tempfile.TemporaryDirectory() as tmpdir:
    hook = SubprocessHook()

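    # Authenticate via the git cookie daemon, then clone the recipe and AOTC repos.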
    result = hook.run_command(
        [
            "bash",
            "-c",
            ";".join(
                git_cookie_authdaemon() + clone_recipes_gob() + get_aotc_repo()
            ),
        ],
        cwd=tmpdir,
    )

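    # Lay out the cloned repo paths and read run parameters from the recipe config.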
    recipe_repo_root = os.path.join(
        tmpdir, "reproducible-benchmark-recipes/projects/gpu-recipes"
    )
    aotc_repo_root = os.path.join(tmpdir, "benchmark-automation/aotc/src")
    num_gpus = extract_gpus(recipe_repo_root, VALUE_YAML_PATH)
    config_yaml_path = (
        f"src/frameworks/{HYPERCOMPUTER}/nemo-configs/"
        f"{MODEL_ID}-{num_gpus}gpus-{PRECISION}.yaml"
    )
    full_config_yaml_path = os.path.join(recipe_repo_root, config_yaml_path)

    (
        global_batch_size,
        optimizer,
        precision,
        seq_length,
        max_steps,
    ) = extract_run_details(recipe_repo_root, config_yaml_path)

    accelerator_type = get_accelerator_type(HYPERCOMPUTER)
    print(
        f"batch size: {global_batch_size}, num gpus: {num_gpus},"
        f" precision: {precision}, seq length: {seq_length},"
        f" max steps: {max_steps}"
    )

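    # Configure the cluster, stage the recipe, run the Helm workload to
    # completion, and collect the metrics from the results bucket.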
    result = hook.run_command(
        [
            "bash",
            "-c",
            ";".join(
                configure_project_and_cluster(CLUSTER, CLUSTER_REGION)
                + get_gpu_recipe_cmd(
                    HYPERCOMPUTER, MODEL_ID, FRAMEWORK, recipe_repo_root
                )
                + install_helm_cmds()
                + namespace_cmds()
                + get_pre_workload_cmds(MODEL_ID, FRAMEWORK, IMAGE_VERSION)
                + helm_apply_cmds(
                    FRAMEWORK,
                    HYPERCOMPUTER,
                    full_config_yaml_path,
                    recipe_repo_root,
                )
                + wait_for_jobs_cmds()
                + copy_bucket_cmds(recipe_repo_root)
                + get_metrics_cmds(
                    global_batch_size,
                    num_gpus,
                    PRECISION,
                    MODEL_ID,
                    accelerator_type,
                    tmpdir,
                )
                + cleanup_cmds()
            ),
        ],
        cwd=tmpdir,
    )
    assert result.exit_code == 0, f"Command failed with code {result.exit_code}"

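    # Parse the averaged step time and MFU, then record the run in the benchmark DB.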
    average_step_time, mfu = get_metrics(tmpdir)

    write_run(
        model_id=MODEL_ID,
        hardware_id=HYPERCOMPUTER,
        software_id=SOFTWARE_ID,
        number_of_nodes=num_gpus / 8,
        number_of_chips=num_gpus,
        container_image_name=IMAGE_VERSION,
        global_batch_size=global_batch_size,
        precision=precision,
        optimizer=optimizer,
        seq_length=seq_length,
        median_step_time=average_step_time,
        e2e_time=0,
        number_of_steps=max_steps,
        mfu=mfu,
        tokens_per_second=1,
        writer_path=aotc_repo_root,
        topology="2X2",
        comment="Regression tests",
        is_test=False,
    )


with models.DAG(
    dag_id=f"{HYPERCOMPUTER}_recipes_{MODEL_ID}_{FRAMEWORK}",
    schedule=SCHEDULED_TIME,
    tags=[
        "simple",
        "aotc",
        "nightly",
        "reproducibility",
        "experimental",
        "xlml",
    ],
    start_date=datetime.datetime(2024, 11, 15),
    catchup=False,
) as dag:
  run_aotc_workload()
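
Review note: each *_cmds() helper above returns a plain list of shell command
strings; the task flattens them with ";".join(...) and runs the result under a
single `bash -c` invocation, so state such as the working directory carries
across steps. A minimal sketch of the pattern (fetch_cmds is an illustrative
stand-in, not one of the real utilities):

from airflow.hooks.subprocess import SubprocessHook


def fetch_cmds():
  # Illustrative stand-in for helpers like clone_recipes_gob(), which also
  # return plain lists of shell command strings.
  return ["git clone https://example.com/recipes.git", "cd recipes"]


hook = SubprocessHook()
# The joined string runs in one bash process, so the `cd` affects later steps.
result = hook.run_command(
    ["bash", "-c", ";".join(fetch_cmds() + ["git log -1"])],
    cwd="/tmp",
)
assert result.exit_code == 0, f"Command failed with code {result.exit_code}"

One caveat: ";" does not short-circuit, so a failure in an early command is
masked and only the last command's exit code reaches the assert; joining with
" && " instead would surface the first failing step.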
175 changes: 175 additions & 0 deletions dags/map_reproducibility/a3ultra_mixtral_nemo.py
@@ -0,0 +1,175 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""DAGs to run Aotc reproducibility benchmarks."""

import datetime
import os
import tempfile

from airflow import models
from airflow.decorators import task
from airflow.hooks.subprocess import SubprocessHook
from dags import composer_env
from dags.map_reproducibility.utils.common_utils import get_metrics_cmds
from dags.map_reproducibility.utils.common_utils import configure_project_and_cluster
from dags.map_reproducibility.utils.common_utils import install_helm_cmds
from dags.map_reproducibility.utils.common_utils import namespace_cmds
from dags.map_reproducibility.utils.common_utils import wait_for_jobs_cmds
from dags.map_reproducibility.utils.common_utils import copy_bucket_cmds
from dags.map_reproducibility.utils.common_utils import cleanup_cmds
from dags.map_reproducibility.utils.common_utils import git_cookie_authdaemon
from dags.map_reproducibility.utils.common_utils import clone_recipes_gob
from dags.map_reproducibility.utils.common_utils import helm_apply_cmds
from dags.map_reproducibility.utils.common_utils import get_metrics
from dags.map_reproducibility.utils.common_utils import get_aotc_repo
from dags.map_reproducibility.utils.benchmarkdb_utils import write_run
from dags.map_reproducibility.utils.common_utils import extract_run_details
from dags.map_reproducibility.utils.common_utils import extract_gpus
from dags.map_reproducibility.utils.common_utils import get_accelerator_type
from dags.map_reproducibility.utils.common_utils import get_pre_workload_cmds
from dags.map_reproducibility.utils.common_utils import get_gpu_recipe_cmd

# Run once a day at 2 pm UTC (6 am PST)
SCHEDULED_TIME = "0 14 * * *" if composer_env.is_prod_env() else None

MODEL_ID = "mixtral-8x7b"
PRECISION = "bf16"
HYPERCOMPUTER = "a3ultra"
FRAMEWORK = "nemo"
VALUE_YAML_PATH = (
    f"training/{HYPERCOMPUTER}/{MODEL_ID}/nemo-pretraining-gke/values.yaml"
)
CLUSTER = "gke-a3ultra-map"
CLUSTER_REGION = "europe-west1"
SOFTWARE_ID = "pytorch_nemo"
IMAGE_VERSION = "nemo_workload:24.07"


@task
def run_aotc_workload():
  with tempfile.TemporaryDirectory() as tmpdir:
    hook = SubprocessHook()

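    # Authenticate via the git cookie daemon, then clone the recipe and AOTC repos.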
    result = hook.run_command(
        [
            "bash",
            "-c",
            ";".join(
                git_cookie_authdaemon() + clone_recipes_gob() + get_aotc_repo()
            ),
        ],
        cwd=tmpdir,
    )

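    # Lay out the cloned repo paths and read run parameters from the recipe config.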
    recipe_repo_root = os.path.join(
        tmpdir, "reproducible-benchmark-recipes/projects/gpu-recipes"
    )
    aotc_repo_root = os.path.join(tmpdir, "benchmark-automation/aotc/src")
    num_gpus = extract_gpus(recipe_repo_root, VALUE_YAML_PATH)
    # NOTE: the config filename uses a hardcoded GPU count rather than the
    # value extracted from values.yaml above.
    num_gpus_temp = 256
    config_yaml_path = (
        f"src/frameworks/{HYPERCOMPUTER}/nemo-configs/"
        f"{MODEL_ID}-{num_gpus_temp}gpus-a3u-{PRECISION}.yaml"
    )
    full_config_yaml_path = os.path.join(recipe_repo_root, config_yaml_path)

    (
        global_batch_size,
        optimizer,
        precision,
        seq_length,
        max_steps,
    ) = extract_run_details(recipe_repo_root, config_yaml_path)

    accelerator_type = get_accelerator_type(HYPERCOMPUTER)
    print(
        f"batch size: {global_batch_size}, num gpus: {num_gpus},"
        f" precision: {precision}, seq length: {seq_length},"
        f" max steps: {max_steps}"
    )

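    # Configure the cluster, stage the recipe, run the Helm workload to
    # completion, and collect the metrics from the results bucket.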
    result = hook.run_command(
        [
            "bash",
            "-c",
            ";".join(
                configure_project_and_cluster(CLUSTER, CLUSTER_REGION)
                + get_gpu_recipe_cmd(
                    HYPERCOMPUTER, MODEL_ID, FRAMEWORK, recipe_repo_root
                )
                + install_helm_cmds()
                + namespace_cmds()
                + get_pre_workload_cmds(MODEL_ID, FRAMEWORK, IMAGE_VERSION)
                + helm_apply_cmds(
                    FRAMEWORK,
                    HYPERCOMPUTER,
                    full_config_yaml_path,
                    recipe_repo_root,
                )
                + wait_for_jobs_cmds()
                + copy_bucket_cmds(recipe_repo_root)
                + get_metrics_cmds(
                    global_batch_size,
                    num_gpus,
                    PRECISION,
                    MODEL_ID,
                    accelerator_type,
                    tmpdir,
                )
                + cleanup_cmds()
            ),
        ],
        cwd=tmpdir,
    )
    assert result.exit_code == 0, f"Command failed with code {result.exit_code}"

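    # Parse the averaged step time and MFU, then record the run in the benchmark DB.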
    average_step_time, mfu = get_metrics(tmpdir)

    write_run(
        model_id=MODEL_ID,
        hardware_id=HYPERCOMPUTER,
        software_id=SOFTWARE_ID,
        number_of_nodes=num_gpus / 8,
        number_of_chips=num_gpus,
        container_image_name=IMAGE_VERSION,
        global_batch_size=global_batch_size,
        precision=precision,
        optimizer=optimizer,
        seq_length=seq_length,
        median_step_time=average_step_time,
        e2e_time=0,
        number_of_steps=max_steps,
        mfu=mfu,
        tokens_per_second=1,
        writer_path=aotc_repo_root,
        topology="2X2",
        comment="Regression tests",
        is_test=False,
    )


with models.DAG(
    dag_id=f"{HYPERCOMPUTER}_recipes_{MODEL_ID}_{FRAMEWORK}",
    schedule=SCHEDULED_TIME,
    tags=[
        "simple",
        "aotc",
        "nightly",
        "reproducibility",
        "experimental",
        "xlml",
    ],
    start_date=datetime.datetime(2024, 11, 15),
    catchup=False,
) as dag:
  run_aotc_workload()
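
Review note: apart from the constants at the top, this file duplicates
a3mega_gpt3_175b_nemo.py line for line; if more recipes land, a shared builder
would keep them in sync. A hedged sketch of that refactor (the module path and
all names here are hypothetical):

# e.g. dags/map_reproducibility/utils/dag_builder.py (hypothetical module)
import datetime
from typing import Optional

from airflow import models
from airflow.decorators import task


def build_recipe_dag(
    model_id: str,
    precision: str,
    hypercomputer: str,
    cluster: str,
    cluster_region: str,
    schedule: Optional[str],
    framework: str = "nemo",
) -> models.DAG:
  """Builds one recipe regression DAG from the per-recipe constants."""
  with models.DAG(
      dag_id=f"{hypercomputer}_recipes_{model_id}_{framework}",
      schedule=schedule,
      tags=[
          "simple",
          "aotc",
          "nightly",
          "reproducibility",
          "experimental",
          "xlml",
      ],
      start_date=datetime.datetime(2024, 11, 15),
      catchup=False,
  ) as dag:

    @task
    def run_recipe_workload():
      # Shared body of run_aotc_workload(), reading the recipe parameters
      # captured from the enclosing function instead of module constants.
      ...

    run_recipe_workload()
  return dag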