Merge branch 'mlperf-inference' into nvidiaAllGhaction
anandhu-eng authored Oct 14, 2024
2 parents e375b7e + 052dd0d commit c8878df
Showing 23 changed files with 197 additions and 42 deletions.
48 changes: 48 additions & 0 deletions .github/workflows/test-mlperf-inference-dlrm.yml
@@ -0,0 +1,48 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: MLPerf inference DLRM-v2

on:
schedule:
- cron: "30 1 * * *"

jobs:
build_reference:
if: github.repository_owner == 'gateoverflow'
runs-on: [ self-hosted, GO-spr, linux, x64 ]
strategy:
fail-fast: false
matrix:
python-version: [ "3.12" ]
device: [ "cpu" ]

steps:
- name: Test MLPerf Inference DLRM-v2 reference implementation
run: |
source gh_action/bin/deactivate || python3 -m venv gh_action
source gh_action/bin/activate
export CM_REPOS=$HOME/GH_CM
python3 -m pip install cm4mlops
cm pull repo
cm run script --tags=run-mlperf,inference,_performance-only --submitter="MLCommons" --model=dlrm-v2-99 --implementation=reference --backend=pytorch --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --target_qps=1 --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --clean
build_intel:
if: github.repository_owner == 'gateoverflow_off'
runs-on: [ self-hosted, GO-spr, linux, x64 ]
strategy:
fail-fast: false
matrix:
python-version: [ "3.12" ]
backend: [ "pytorch" ]
device: [ "cpu" ]

steps:
- name: Test MLPerf Inference DLRM-v2 INTEL implementation
run: |
source gh_action/bin/deactivate || python3 -m venv gh_action
source gh_action/bin/activate
export CM_REPOS=$HOME/GH_CM
python3 -m pip install cm4mlops
cm pull repo
cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --model=dlrm-v2-99 --implementation=intel --batch_size=1 --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --target_qps=1 --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean
8 changes: 3 additions & 5 deletions .github/workflows/test-mlperf-inference-gptj.yml
@@ -5,7 +5,7 @@ name: MLPerf inference GPT-J

on:
schedule:
- cron: "1 2 * * *"
- cron: "15 19 * * *"

jobs:
build:
@@ -19,15 +19,13 @@ jobs:
precision: [ "float16" ]

steps:
- name: Install dependencies
- name: Test MLPerf Inference GPTJ
run: |
source gh_action/bin/deactivate || python3 -m venv gh_action
source gh_action/bin/activate
export CM_REPOS=$HOME/GH_CM
python3 -m pip install cm4mlops
cm pull repo
- name: Test MLPerf Inference GPTJ
run: |
cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=gptj-99 --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --beam_size=1 --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean
cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=gptj-99 --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --beam_size=1 --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --get_platform_details=yes --implementation=reference --clean
cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/mlperf_inference_test_submissions_v5.0 --repo_branch=main --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet --submission_dir=$HOME/gh_action_submissions
7 changes: 3 additions & 4 deletions .github/workflows/test-mlperf-inference-sdxl.yaml
@@ -1,8 +1,7 @@
name: MLPerf inference SDXL
#off now as we have SCC24 test doing the same
on:
schedule:
- cron: "1 2 * * *"
- cron: "30 9 * * *"

jobs:
build_reference:
@@ -22,5 +21,5 @@ jobs:
export CM_REPOS=$HOME/GH_CM
python3 -m pip install cm4mlops
cm pull repo
cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=sdxl --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean
cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet --submission_dir=$HOME/gh_action_submissions
cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=sdxl --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean
cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/mlperf_inference_test_submissions_v5.0 --repo_branch=main --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet --submission_dir=$HOME/gh_action_submissions
4 changes: 2 additions & 2 deletions .github/workflows/test-scc24-sdxl.yaml
@@ -2,7 +2,7 @@ name: MLPerf inference SDXL (SCC)

on:
schedule:
- cron: "1 3 * * *"
- cron: "5 2 * * *"

jobs:
build_reference:
@@ -52,7 +52,7 @@ jobs:
pip install --upgrade cm4mlops
pip install tabulate
cm pull repo
cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --hw_name=go-spr --clean
cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --pull_changes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --hw_name=go-spr --custom_system_nvidia=yes --clean
cm run script --tags=run-mlperf,inference,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean
cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions --results_dir=$HOME/scc_gh_action_results/test_results
cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet --submission_dir=$HOME/scc_gh_action_submissions
8 changes: 4 additions & 4 deletions script/app-mlperf-inference-intel/_cm.yaml
@@ -940,10 +940,10 @@ variations:
names:
- pip-package
- accelerate
- tags: get,generic-python-lib,_package.torch,_path.https://download.pytorch.org/whl/nightly/cpu-cxx11-abi/torch-2.1.0.dev20230715%2Bcpu.cxx11.abi-cp39-cp39-linux_x86_64.whl
names:
- pip-package
- pip-torch
- tags: install,pytorch,from-src,_for-intel-mlperf-inference-v3.1-dlrm-v2
names:
- pytorch
- torch
dlrm-v2_:
env: {}

7 changes: 7 additions & 0 deletions script/app-mlperf-inference-nvidia/_cm.yaml
@@ -242,6 +242,13 @@ deps:
names:
- nvidia-inference-common-code

- tags: pull,git,repo
env:
CM_GIT_CHECKOUT_PATH: '<<<CM_MLPERF_INFERENCE_NVIDIA_CODE_PATH>>>'
enable_if_env:
CM_MLPERF_INFERENCE_PULL_CODE_CHANGES:
- 'yes'

# Creates user conf for given SUT
- tags: generate,user-conf,mlperf,inference
names:
19 changes: 17 additions & 2 deletions script/app-mlperf-inference/_cm.yaml
@@ -111,6 +111,17 @@ deps:

posthook_deps:
- tags: get,mlperf,sut,description #populate system meta information like framework
- tags: get,platform,details
enable_if_any_env:
CM_SUDO_USER:
- yes
CM_GET_PLATFORM_DETAILS:
- yes
skip_if_env:
CM_MLPERF_LOADGEN_MODE:
- accuracy
env:
CM_PLATFORM_DETAILS_FILE_PATH: '<<<CM_MLPERF_OUTPUT_DIR>>>/system_info.txt'
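
Note on the gating of the new get,platform,details posthook dependency above (an illustrative sketch of the declared enable_if_any_env / skip_if_env conditions, not the actual CM dependency resolver; env values are assumed to be the strings shown in the YAML):

    # Platform details are collected only when either trigger variable is set,
    # and never for accuracy runs. Illustration only; the real logic is in CM.
    def run_platform_details(env):
        enabled = (env.get('CM_SUDO_USER') == 'yes'
                   or env.get('CM_GET_PLATFORM_DETAILS') == 'yes')
        skipped = env.get('CM_MLPERF_LOADGEN_MODE') == 'accuracy'
        return enabled and not skipped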

# Order of variations for documentation
variation_groups_order:
@@ -985,19 +996,24 @@ variations:
docker:
deps:
- tags: get,dlrm,data,mlperf,inference,_nvidia
mounts:
- "${{ DLRM_DATA_PATH }}:/home/mlperf_inf_dlrmv2"

dlrm_,intel:
docker:
deps:
- tags: get,preprocessed,dataset,criteo,_mlc
mounts:
- "${{ DLRM_DATA_PATH }}:${{ DLRM_DATA_PATH }}"

dlrm_,reference:
docker:
deps:
- tags: get,preprocessed,dataset,criteo,_mlc
- tags: get,ml-model,dlrm,_pytorch,_fp32
mounts:
- ${{ CM_ML_MODEL_FILE_WITH_PATH }}:${{ CM_ML_MODEL_FILE_WITH_PATH }}
- "${{ CM_ML_MODEL_FILE_WITH_PATH }}:${{ CM_ML_MODEL_FILE_WITH_PATH }}"
- "${{ DLRM_DATA_PATH }}:${{ DLRM_DATA_PATH }}"
dockerfile_env:
CM_ML_MODEL_FILE_WITH_PATH: "on"
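
For context on the "${{ VAR }}" mount entries added above: each template is filled in from the environment and passed to docker as a host:container bind mount. The sketch below is a simplified illustration of that expansion under the assumption that mounts with unset variables are dropped; the actual handling lives in CM's docker support scripts.

    import re

    def expand_mounts(mount_templates, env):
        # Substitute "${{ VAR }}" placeholders and emit docker "-v host:container" args.
        args = []
        for template in mount_templates:
            expanded = re.sub(r'\$\{\{ *([A-Za-z0-9_]+) *\}\}',
                              lambda m: env.get(m.group(1), ''), template)
            host, _, container = expanded.partition(':')
            if host and container:      # skip mounts whose variables are unset (assumption)
                args += ['-v', host + ':' + container]
        return args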

@@ -1626,7 +1642,6 @@ docker:
- "${{ GPTJ_CHECKPOINT_PATH }}:${{ GPTJ_CHECKPOINT_PATH }}"
- "${{ CM_CRITEO_PREPROCESSED_PATH }}:${{ CM_CRITEO_PREPROCESSED_PATH }}"
- "${{ LLAMA2_CHECKPOINT_PATH }}:${{ LLAMA2_CHECKPOINT_PATH }}"
- "${{ DLRM_DATA_PATH }}:/home/mlperf_inf_dlrmv2"
- "${{ CM_NVIDIA_LLAMA_DATASET_FILE_PATH }}:${{ CM_NVIDIA_LLAMA_DATASET_FILE_PATH }}"
- "${{ SDXL_CHECKPOINT_PATH }}:${{ SDXL_CHECKPOINT_PATH }}"
- "${{ CM_DATASET_KITS19_PREPROCESSED_PATH }}:${{ CM_DATASET_KITS19_PREPROCESSED_PATH }}"
@@ -25,6 +25,9 @@ def preprocess(i):
if env.get('CM_CLEAN_ARTIFACT_NAME', '') == 'preprocessed_data':
clean_cmd = f"""rm -rf {os.path.join(env['CM_NVIDIA_MLPERF_SCRATCH_PATH'], "preprocessed_data", "coco2014-tokenized-sdxl")} """
cache_rm_tags = "nvidia-harness,_preprocess_data,_sdxl"
if env.get('CM_CLEAN_ARTIFACT_NAME', '') == 'downloaded_model':
clean_cmd = f"""rm -rf {os.path.join(env['CM_NVIDIA_MLPERF_SCRATCH_PATH'], "models", "SDXL")} """
cache_rm_tags = "nvidia-harness,_download_model,_sdxl"

cache_rm_tags = cache_rm_tags + extra_cache_rm_tags

27 changes: 26 additions & 1 deletion script/generate-mlperf-inference-submission/customize.py
@@ -239,8 +239,11 @@ def generate_submission(i):

results = {}

model_platform_info_file = None

for model in models:
results[model] = {}
platform_info_file = None
result_model_path = os.path.join(result_path, model)
submission_model_path = os.path.join(submission_path, model)
measurement_model_path = os.path.join(measurement_path, model)
@@ -386,8 +389,10 @@ def generate_submission(i):
files.append(f)
elif f == "spl.txt":
files.append(f)
elif f in [ "README.md", "README-extra.md", "cm-version-info.json", "os_info.json", "cpu_info.json", "pip_freeze.json" ] and mode == "performance":
elif f in [ "README.md", "README-extra.md", "cm-version-info.json", "os_info.json", "cpu_info.json", "pip_freeze.json", "system_info.txt" ] and mode == "performance":
shutil.copy(os.path.join(result_mode_path, f), os.path.join(submission_measurement_path, f))
if f == "system_info.txt" and not platform_info_file:
platform_info_file = os.path.join(result_mode_path, f)
elif f in [ "console.out" ]:
shutil.copy(os.path.join(result_mode_path, f), os.path.join(submission_measurement_path, mode+"_"+f))

@@ -417,6 +422,26 @@ def generate_submission(i):
with open(readme_file, mode='a') as f:
f.write(result_string)

#Copy system_info.txt to the submission measurements model folder if any scenario performance run has it
sys_info_file = None
if os.path.exists(os.path.join(result_model_path, "system_info.txt")):
sys_info_file = os.path.join(result_model_path, "system_info.txt")
elif platform_info_file:
sys_info_file = platform_info_file
if sys_info_file:
model_platform_info_file = sys_info_file
shutil.copy(sys_info_file, os.path.join(measurement_model_path, "system_info.txt"))

#Copy system_info.txt to the submission measurements folder if any model performance run has it
sys_info_file = None
if os.path.exists(os.path.join(result_path, "system_info.txt")):
sys_info_file = os.path.join(result_path, "system_info.txt")
elif model_platform_info_file:
sys_info_file = model_platform_info_file
if sys_info_file:
shutil.copy(sys_info_file, os.path.join(measurement_path, "system_info.txt"))


with open(system_file, "w") as fp:
json.dump(system_meta, fp, indent=2)
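
The copy logic added above gives a two-level fallback: the model-level measurements directory prefers a system_info.txt found directly under the model's results and otherwise inherits the one captured by a scenario performance run, and the top-level measurements directory repeats the same choice one level up. A condensed, illustrative sketch of that precedence (not the actual submission generator):

    import os

    def pick_system_info(level_dir, inherited_path):
        # Prefer a file already present at this level, else inherit from below.
        local = os.path.join(level_dir, "system_info.txt")
        return local if os.path.exists(local) else inherited_path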

4 changes: 2 additions & 2 deletions script/get-cuda-devices/customize.py
@@ -53,8 +53,8 @@ def postprocess(i):
key_env = 'CM_CUDA_DEVICE_PROP_'+key.upper().replace(' ','_')
env[key_env] = val

state['cm_cuda_num_devices'] = gpu_id
env['CM_CUDA_NUM_DEVICES'] = gpu_id
state['cm_cuda_num_devices'] = gpu_id + 1
env['CM_CUDA_NUM_DEVICES'] = gpu_id + 1

state['cm_cuda_device_prop'] = p
state['cm_cuda_devices_prop'] = gpu
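
The change above fixes an off-by-one: gpu_id holds the zero-based index of the last enumerated device, so the device count is gpu_id + 1. A minimal sketch of the pattern, using a hypothetical detection result rather than the script's real probing code:

    # With two GPUs, the loop leaves gpu_id == 1, so the count must be gpu_id + 1.
    devices = ["NVIDIA RTX 4090", "NVIDIA RTX 4090"]   # hypothetical detection output
    gpu_id = -1
    gpu = {}
    for gpu_id, name in enumerate(devices):
        gpu[gpu_id] = {"GPU Device ID": gpu_id, "GPU Name": name}
    num_devices = gpu_id + 1                            # == len(devices) == 2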
20 changes: 17 additions & 3 deletions script/get-generic-python-lib/_cm.json
@@ -874,12 +874,26 @@
"deps": [
{
"tags": "get,generic-python-lib,_package.networkx",
"enable_if_env": {
"CM_PYTHON_MINOR_VERSION": [ "7", "8" ]
}
"enable_if_env": {
"CM_PYTHON_MINOR_VERSION": [ "7", "8" ]
}
}
]
},
"cxx11-abi": {
"env": {
}
},
"torch,cxx11-abi": {
"env": {
"CM_GENERIC_PYTHON_PIP_EXTRA_INDEX_URL": "https://download.pytorch.org/whl/nightly/cpu-cxx11-abi"
}
},
"package.torch,cxx11-abi": {
"env": {
"CM_GENERIC_PYTHON_PIP_INDEX_URL": "https://download.pytorch.org/whl/nightly/cpu-cxx11-abi"
}
},
"torch,pre": {
"default_env": {
"CM_GENERIC_PYTHON_PIP_UNINSTALL_DEPS": "torch"
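
For context on the new cxx11-abi variations: CM_GENERIC_PYTHON_PIP_INDEX_URL typically maps to pip's --index-url, which replaces the default PyPI index, while CM_GENERIC_PYTHON_PIP_EXTRA_INDEX_URL maps to --extra-index-url, which is searched in addition to it. The sketch below is a hedged illustration of that mapping, not the script's actual implementation in get-generic-python-lib.

    import os

    def pip_install_args(package):
        # Build an illustrative pip command from the two variation-controlled env vars.
        args = ["python3", "-m", "pip", "install", package]
        index_url = os.environ.get("CM_GENERIC_PYTHON_PIP_INDEX_URL", "").strip()
        extra_index_url = os.environ.get("CM_GENERIC_PYTHON_PIP_EXTRA_INDEX_URL", "").strip()
        if index_url:
            args += ["--index-url", index_url]              # replaces the default PyPI index
        if extra_index_url:
            args += ["--extra-index-url", extra_index_url]  # searched in addition to PyPI
        return args

    # e.g. with the _package.torch,cxx11-abi variation active, this would yield:
    # python3 -m pip install torch --index-url https://download.pytorch.org/whl/nightly/cpu-cxx11-abi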
1 change: 1 addition & 0 deletions script/get-generic-python-lib/customize.py
@@ -78,6 +78,7 @@ def preprocess(i):

# Check extra index URL
extra_index_url = env.get('CM_GENERIC_PYTHON_PIP_EXTRA_INDEX_URL','').strip()

if extra_index_url != '':
# Check special cases
if '${CM_TORCH_CUDA}' in extra_index_url:
5 changes: 3 additions & 2 deletions script/get-platform-details/customize.py
@@ -16,14 +16,15 @@ def preprocess(i):
if os_info['platform'] == "windows":
return {'return':1, 'error':'get-platform-details script not yet supported in windows!'}

print(env['CM_HOST_OS_KERNEL_VERSION'])

if not check_installation("numactl",os_info):
env['CM_INSTALL_NUMACTL'] = 'True'

#if not check_installation("cpupower",os_info):
env['CM_INSTALL_CPUPOWER'] = 'True'

if env.get('CM_PLATFORM_DETAILS_FILE_PATH', '') == '':
env['CM_PLATFORM_DETAILS_FILE_PATH'] = os.path.join(os.getcwd(), "system_info.txt")

return {'return':0}


15 changes: 8 additions & 7 deletions script/get-platform-details/run.sh
@@ -1,8 +1,9 @@
#!/bin/bash

OUTPUT_FILE="system_info.txt"

echo "WARNING: sudo permission is needed to some packages for measuring the platform details"
OUTPUT_FILE="$CM_PLATFORM_DETAILS_FILE_PATH"
#set -e
#echo $OUTPUT_FILE
echo "WARNING: sudo permission is needed for some of the below commands"

if [[ ${CM_HOST_OS_FLAVOR} == "macos" ]]; then
echo "WARNING: To be done for the mac os"
@@ -46,7 +47,7 @@ else

echo "8. numactl --hardware" >> $OUTPUT_FILE
eval "numactl --hardware" >> $OUTPUT_FILE
test $? -eq 0 || exit $?
#test $? -eq 0 || exit $?
echo "------------------------------------------------------------" >> $OUTPUT_FILE

echo "9. /proc/meminfo" >> $OUTPUT_FILE
@@ -81,7 +82,7 @@ else

echo "15. sysctl" >> $OUTPUT_FILE
eval "sudo sysctl -a" >> $OUTPUT_FILE
test $? -eq 0 || exit $?
#test $? -eq 0 || exit $?
echo "------------------------------------------------------------" >> $OUTPUT_FILE

echo "16. /sys/kernel/mm/transparent_hugepage" >> $OUTPUT_FILE
@@ -111,12 +112,12 @@ else

echo "21. dmidecode" >> $OUTPUT_FILE
eval "sudo dmidecode" >> $OUTPUT_FILE
test $? -eq 0 || exit $?
#test $? -eq 0 || exit $?
echo "------------------------------------------------------------" >> $OUTPUT_FILE

echo "22. BIOS" >> $OUTPUT_FILE
eval "sudo dmidecode -t bios" >> $OUTPUT_FILE
test $? -eq 0 || exit $?
#test $? -eq 0 || exit $?
echo "------------------------------------------------------------" >> $OUTPUT_FILE

echo "System information has been saved to $PWD/$OUTPUT_FILE"
6 changes: 3 additions & 3 deletions script/install-ipex-from-src/_cm.json
@@ -214,10 +214,10 @@
"version": "1.23.5"
},
{
"tags": "get,generic-python-lib,_package.torch,_path.https://download.pytorch.org/whl/nightly/cpu-cxx11-abi/torch-2.1.0.dev20230715%2Bcpu.cxx11.abi-cp39-cp39-linux_x86_64.whl",
"tags": "install,pytorch,from-src,_for-intel-mlperf-inference-v3.1-dlrm-v2",
"names": [
"pip-package",
"pip-torch"
"pytorch",
"torch"
]
}
],