Merge pull request #208 from mlcommons/mlperf-inference
Mlperf inference
arjunsuresh authored Sep 13, 2024
2 parents 5b20820 + 0cf5b7e commit ad11c90
Showing 37 changed files with 445 additions and 151 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test-cm-tutorial-tvm-pip.yml
@@ -5,7 +5,7 @@ name: CM tutorial tvm pip install

on:
pull_request:
branches: [ "main", "test" ]
branches: [ "main", "test", "mlperf-inference" ]
paths:
- '.github/workflows/test-cm-tutorial-tvm-pip.yml'
- '**'
6 changes: 3 additions & 3 deletions .github/workflows/test-mlperf-inference-abtf-poc.yml
@@ -37,7 +37,7 @@ jobs:
cm pull repo mlcommons@cm4abtf --branch=poc
- name: Test MLPerf Inference ABTF POC using ${{ matrix.backend }} on docker
run: |
cm run script --tags=run-abtf,inference,_poc-demo --test_query_count=5 --adr.compiler.tags=gcc --quiet -v
cm run script --tags=run-abtf,inference,_poc-demo --test_query_count=5 --adr.compiler.tags=gcc --adr.cocoeval.version_max=1.5.7 --adr.cocoeval.version_max_usable=1.5.7 --quiet -v
build2:
runs-on: ${{ matrix.os }}
@@ -62,7 +62,7 @@ jobs:
cm pull repo mlcommons@cm4abtf --branch=poc
- name: Test MLPerf Inference ABTF POC using ${{ matrix.backend }} on ${{ matrix.os }}
run: |
cm run script --tags=run-abtf,inference,_poc-demo --adr.compiler.tags=gcc --quiet -v
cm run script --tags=run-abtf,inference,_poc-demo --adr.compiler.tags=gcc --adr.cocoeval.version_max=1.5.7 --adr.cocoeval.version_max_usable=1.5.7 --quiet -v
build3:
runs-on: ${{ matrix.os }}
@@ -89,4 +89,4 @@ jobs:
cm pull repo mlcommons@cm4abtf --branch=poc
- name: Test MLPerf Inference ABTF POC using ${{ matrix.backend }} on ${{ matrix.os }}
run: |
cm run script --tags=run-abtf,inference,_poc-demo --quiet --env.CM_MLPERF_LOADGEN_BUILD_FROM_SRC=off -v
cm run script --tags=run-abtf,inference,_poc-demo --quiet --env.CM_MLPERF_LOADGEN_BUILD_FROM_SRC=off --adr.cocoeval.version_max=1.5.7 --adr.cocoeval.version_max_usable=1.5.7 -v
9 changes: 5 additions & 4 deletions automation/script/module.py
@@ -2340,7 +2340,9 @@ def search(self, i):
# Print filtered paths if console
if console:
for script in r['list']:
logging.info(script.path)
# This should not be logging since the output can be consumed by other external tools and scripts
# logging.info(script.path)
print (script.path)

# Finalize output
r['script_tags'] = script_tags
@@ -2355,7 +2357,7 @@ def test(self, i):
Test automation (TBD)
Args:
(CM input dict):
(CM input dict):
(out) (str): if 'con', output to console
@@ -2641,8 +2643,7 @@ def add(self, i):
if k in ii: del ii[k]

if artifact_repo != None:
artifact = ii.get('artifact','')
ii['artifact'] = utils.assemble_cm_object2(artifact_repo) + ':' + artifact
ii['artifact'] = utils.assemble_cm_object2(artifact_repo) + ':' + utils.assemble_cm_object2(artifact_repo)

r_obj=self.cmind.access(ii)
if r_obj['return']>0: return r_obj
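The switch from `logging.info()` to a bare `print()` in `search()` matters because the listed script paths are meant to be consumed by other tools and scripts, as the new comment says. A minimal sketch of the difference (not part of the commit; the path is made up):

```python
# Illustration only: logging records go to stderr with an "INFO:root:" prefix by
# default, so a downstream pipe that reads stdout would see nothing useful;
# print() emits the bare path on stdout.
import logging

logging.basicConfig(level=logging.INFO)

script_path = "/home/user/CM/repos/mlcommons@cm4mlops/script/app-mlperf-inference"  # hypothetical

logging.info(script_path)   # stderr: "INFO:root:/home/user/..." -> lost when only stdout is piped
print(script_path)          # stdout: the bare path, easy for external tools to parse
```

With the `print()` version, redirecting stdout (e.g. something like `cm find script --tags=... > paths.txt`) yields exactly one clean path per line.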
4 changes: 4 additions & 0 deletions script/app-mlperf-inference-amd/customize.py
@@ -13,6 +13,9 @@ def preprocess(i):
if env.get('CM_MLPERF_SKIP_RUN', '') == "yes":
return {'return':0}

env['CM_MLPERF_AMD_SCRIPT_PATH'] = env['CM_TMP_CURRENT_SCRIPT_PATH']
env['CM_MLPERF_AMD_CODE_PATH'] = os.path.join(env['CM_MLPERF_INFERENCE_IMPLEMENTATION_REPO'], "closed", "AMD")

if 'CM_MODEL' not in env:
return {'return': 1, 'error': 'Please select a variation specifying the model to run'}
if 'CM_MLPERF_BACKEND' not in env:
@@ -22,6 +25,7 @@

if "llama2" in env['CM_MODEL']:
env['CM_RUN_DIR'] = i['run_script_input']['path']
env['CM_MLPERF_AMD_LLAMA2_CODE_PATH'] = os.path.join(env['CM_MLPERF_AMD_CODE_PATH'], "llama2-70b-99.9/VllmFp8")
env['CM_RUN_CMD'] = "bash run-llama2.sh "
else:
return {'return':1, 'error':'Model {} not supported'.format(env['CM_MODEL'])}
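The two path variables added in `preprocess()` anchor the AMD harness inside the inference-results checkout, and `CM_MLPERF_AMD_LLAMA2_CODE_PATH` is what the new run-llama2.sh below points at. A sketch with a made-up repo root (not part of the commit):

```python
# Illustration only: how the added environment variables resolve, assuming a
# hypothetical value for CM_MLPERF_INFERENCE_IMPLEMENTATION_REPO.
import os

env = {"CM_MLPERF_INFERENCE_IMPLEMENTATION_REPO": "/work/inference_results_v4.1"}  # assumed

code_path = os.path.join(env["CM_MLPERF_INFERENCE_IMPLEMENTATION_REPO"], "closed", "AMD")
llama2_path = os.path.join(code_path, "llama2-70b-99.9/VllmFp8")

print(code_path)    # /work/inference_results_v4.1/closed/AMD
print(llama2_path)  # /work/inference_results_v4.1/closed/AMD/llama2-70b-99.9/VllmFp8
```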
56 changes: 56 additions & 0 deletions script/app-mlperf-inference-amd/run-llama2.sh
@@ -0,0 +1,56 @@
#!/bin/bash

set -xeu

N_SAMPLES=${N_SAMPLES:-24576} #24576 #3072 #2457 #6
TP=1
DP=${DP:-8}
WD=${WD:-0}
SORTING=${SORTING:-descending} #ascending #descending #lexicographic #skip

export HIP_FORCE_DEV_KERNARG=1
export VLLM_USE_TRITON_FLASH_ATTN=0
export VLLM_FP8_PADDING=1
export VLLM_FP8_ACT_PADDING=1
export VLLM_FP8_WEIGHT_PADDING=1
export VLLM_FP8_REDUCE_CONV=1
export VLLM_SCHED_PREFILL_KVC_FREEPCT=31.0

export HARNESS_DISABLE_VLLM_LOGS=1
export VLLM_LOGGING_LEVEL=ERROR

MODEL_PATH=${CM_ML_MODEL_LLAMA2_FILE_WITH_PATH:-/data/llm/llama2-70b-chat/}
DATASET_PATH=${CM_DATASET_OPENORCA_PREPROCESSED_PATH:-/data/open_orca/open_orca_gpt4_tokenized_llama.sampled_24576.pkl.gz}
QUANTIZED_WEIGHTS_PATH=${CM_LLAMA2_FINAL_SAFE_TENSORS_PATH:-quantized/quark_share/modelzoo/llama2_70b_wfp8_afp8_ofp8_nomerge/json-safetensors/llama.safetensors}
QUANTIZATION_PARAM_PATH=${QUANTIZATION_PARAM_PATH:-/app/kv_cache_scales.json}

MLPERF_CONF="${CM_MLPERF_CONF:-/app/mlperf_inference/mlperf.conf}"
USER_CONF="${CM_MLPERF_USER_CONF:-/lab-mlperf-inference/code/llama2-70b-99.9/mlperf_config_VllmFp8/user.conf}"

SUBMISSION=${SUBMISSION:-0}

LOG_DIR=${CM_MLPERF_OUTPUT_DIR}

cp $USER_CONF ${LOG_DIR}/user.conf

cmd="${CM_PYTHON_BIN_WITH_PATH} ${CM_MLPERF_AMD_LLAMA2_CODE_PATH}/mainVllmFp8_Offline.py \
--scenario ${CM_MLPERF_LOADGEN_SCENARIO} \
--output-log-dir ${LOG_DIR} \
--model-path $MODEL_PATH \
--mlperf-conf $MLPERF_CONF \
--user-conf $USER_CONF \
--total-sample-count $N_SAMPLES \
--dataset-path $DATASET_PATH \
--dtype float16 \
--backend vllm \
--device cuda:0 \
--kv-cache-dtype fp8 \
-tp ${TP} \
-dp ${DP} \
--quantization fp8 \
--quantized-weights-path ${QUANTIZED_WEIGHTS_PATH} \
--quantization-param-path ${QUANTIZATION_PARAM_PATH} \
--warmup-duration ${WD} \
--sorting ${SORTING} \
--enforce-eager True \
--gpu-memory-utilization 0.99"
3 changes: 3 additions & 0 deletions script/app-mlperf-inference-mlcommons-python/_cm.yaml
@@ -780,6 +780,9 @@ variations:
CM_MLPERF_BACKEND_VERSION: <<<CM_ONNXRUNTIME_VERSION>>>
deps:
- tags: get,generic-python-lib,_onnx
- tags: get,generic-python-lib,_numpy
version_max: "1.26.4"
version_max_usable: "1.26.4"
- tags: get,tvm
names:
- tvm
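The new `_numpy` dependency is pinned with `version_max: 1.26.4` because NumPy 2.x is not binary-compatible with extensions built against the 1.x series, which is the usual reason for pins like this. A hypothetical runtime guard (not part of the commit, assumes the `packaging` package is installed) expressing the same constraint:

```python
# Hypothetical guard mirroring the version_max pin above: reject NumPy 2.x,
# which extensions compiled against 1.x generally cannot import.
from packaging.version import Version

import numpy as np

MAX_SUPPORTED = Version("1.26.4")  # matches version_max / version_max_usable in _cm.yaml

if Version(np.__version__) > MAX_SUPPORTED:
    raise RuntimeError(
        f"numpy {np.__version__} is newer than the pinned maximum {MAX_SUPPORTED}; "
        "install numpy<=1.26.4 for this variation"
    )
```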
17 changes: 16 additions & 1 deletion script/app-mlperf-inference-nvidia/_cm.yaml
@@ -316,9 +316,16 @@ post_deps:
# Variations to customize dependencies
variations:
# MLPerf inference version
v4.0:
v4.1:
group: version
default: true
env:
CM_MLPERF_INFERENCE_CODE_VERSION: "v4.1"
adr:
pytorch:
tags: _for-nvidia-mlperf-inference-v4.1
v4.0:
group: version
env:
CM_MLPERF_INFERENCE_CODE_VERSION: "v4.0"
CM_MLPERF_GPTJ_MODEL_FP8_PATH_SUFFIX: GPTJ-FP8-quantized
@@ -455,6 +462,14 @@ variations:
- scipy
version: 1.10.1

sdxl,v4.1:
deps:
- tags: get,generic-python-lib,_package.torchrec
version: 0.4.0
- tags: get,generic-python-lib,_package.torchmetrics
version: 1.0.3
- tags: get,generic-python-lib,_package.typeguard

bert_:
deps:
- tags: get,generic-python-lib,_transformers
20 changes: 15 additions & 5 deletions script/app-mlperf-inference/_cm.yaml
@@ -307,7 +307,7 @@ variations:

nvidia-original,r4.1_default:
docker:
base_image: nvcr.io/nvidia/mlperf/mlperf-inference:mlpinf-v4.0-cuda12.2-cudnn8.9-x86_64-ubuntu20.04-public
base_image: nvcr.io/nvidia/mlperf/mlperf-inference:mlpinf-v4.1-cuda12.4-pytorch24.04-ubuntu22.04-x86_64-release

nvidia-original,r4.1_default,gptj_:
docker:
@@ -349,6 +349,8 @@ variations:
os_version: "20.04"
deps:
- tags: get,mlperf,inference,nvidia,scratch,space
names:
- mlperf-inference-nvidia-scratch-space
- tags: get,nvidia-docker
skip_if_env:
CM_SKIP_GET_NVIDIA_DOCKER:
@@ -1114,6 +1116,9 @@ variations:
all_gpus: 'yes'
deps:
- tags: get,nvidia-docker
skip_if_env:
CM_SKIP_GET_NVIDIA_DOCKER:
- yes
group:
device
env:
@@ -1415,19 +1420,19 @@
reproducibility
add_deps_recursive:
nvidia-inference-common-code:
version: r4.0
version: r4.1
tags: _go
nvidia-inference-server:
version: r4.0
version: r4.1
tags: _go
intel-harness:
tags: _v4.0
tags: _v4.1
default_env:
CM_SKIP_SYS_UTILS: 'yes'
CM_REGENERATE_MEASURE_FILES: 'yes'
env:
CM_ENV_NVMITTEN_DOCKER_WHEEL_PATH: '/opt/nvmitten-0.1.3b0-cp38-cp38-linux_x86_64.whl'

CM_MLPERF_INFERENCE_VERSION: '4.1'

invalid_variation_combinations:
-
@@ -1523,7 +1528,11 @@ docker:
use_host_user_id: True
deps:
- tags: get,mlperf,inference,results,dir,local
names:
- get-mlperf-inference-results-dir
- tags: get,mlperf,inference,submission,dir,local
names:
- get-mlperf-inference-submission-dir
pre_run_cmds:
#- cm pull repo && cm run script --tags=get,git,repo,_repo.https://github.com/GATEOverflow/inference_results_v4.0.git --update
- cm pull repo
@@ -1536,6 +1545,7 @@ docker:
- "${{ LLAMA2_CHECKPOINT_PATH }}:${{ LLAMA2_CHECKPOINT_PATH }}"
- "${{ DLRM_DATA_PATH }}:/home/mlperf_inf_dlrmv2"
- "${{ CM_NVIDIA_LLAMA_DATASET_FILE_PATH }}:${{ CM_NVIDIA_LLAMA_DATASET_FILE_PATH }}"
- "${{ SDXL_CHECKPOINT_PATH }}:${{ SDXL_CHECKPOINT_PATH }}"
skip_run_cmd: 'no'
shm_size: '32gb'
interactive: True
2 changes: 1 addition & 1 deletion script/app-mlperf-inference/customize.py
@@ -206,7 +206,7 @@ def postprocess(i):
cm_sut_info['device'] = env['CM_MLPERF_DEVICE']
cm_sut_info['framework'] = state['CM_SUT_META']['framework']
cm_sut_info['run_config'] = env['CM_MLPERF_INFERENCE_SUT_RUN_CONFIG']
with open(os.path.join(result_sut_folder_path,"cm_sut_info.json"), "w") as fp:
with open(os.path.join(result_sut_folder_path,"cm-sut-info.json"), "w") as fp:
json.dump(cm_sut_info, fp, indent=2)

system_meta = state['CM_SUT_META']
2 changes: 1 addition & 1 deletion script/benchmark-program/customize.py
@@ -81,7 +81,7 @@ def preprocess(i):
# generate the post run cmd - for killing the process that records runtime system infos
post_run_cmd = ""
if env.get('CM_PROFILE_NVIDIA_POWER', '') == "on":
post_run_cmd += "echo 'killing process \${cmd_pid}' && kill -TERM \${cmd_pid}"
post_run_cmd += "echo killing process \$cmd_pid && kill -TERM \${cmd_pid}"
print(f"Post run command for killing the process that measures the runtime system information: {post_run_cmd}")

env['CM_POST_RUN_CMD'] = post_run_cmd
24 changes: 16 additions & 8 deletions script/benchmark-program/run.sh
100644 → 100755
@@ -1,4 +1,20 @@
#!/bin/bash

# function to safely exit the background process
safe_exit() {
if [[ "${CM_POST_RUN_CMD}" != "" ]]; then
eval ${CM_POST_RUN_CMD}
if [ $? -eq 0 ]; then
exit 0
else
exit $?
fi
fi
}

# trap signals to redirect the execution flow to safe_exit
trap safe_exit SIGINT SIGTERM

if [[ ${CM_MLPERF_POWER} == "yes" && ${CM_MLPERF_LOADGEN_MODE} == "performance" ]]; then
exit 0
fi
@@ -45,18 +61,10 @@ eval ${CM_PRE_RUN_CMD}
if [[ "${CM_RUN_CMD0}" != "" ]]; then
eval ${CM_RUN_CMD0}
exitstatus=$?
if [ -e exitstatus ]; then
exitstatus=$( cat exitstatus )
fi
test $exitstatus -eq 0 || $exitstatus
else
echo "${CM_RUN_CMD}"
eval ${CM_RUN_CMD}
exitstatus=$?
if [ -e exitstatus ]; then
exitstatus=$( cat exitstatus )
fi
test $exitstatus -eq 0 || $exitstatus
fi

eval ${CM_POST_RUN_CMD}
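The new `trap safe_exit SIGINT SIGTERM` block ensures the post-run command (typically the `kill -TERM` of the background power-monitoring process generated in customize.py above) still runs when the benchmark is interrupted. A rough Python analogue of the same pattern, assuming `CM_POST_RUN_CMD` holds a shell command (sketch only, not part of the commit):

```python
# Rough analogue of the trap/safe_exit pattern added to run.sh: register a handler
# for SIGINT/SIGTERM that runs the post-run cleanup command before exiting, so an
# interrupted benchmark still stops its helper processes.
import os
import signal
import subprocess
import sys

POST_RUN_CMD = os.environ.get("CM_POST_RUN_CMD", "")

def safe_exit(signum, frame):
    if POST_RUN_CMD:
        rc = subprocess.call(POST_RUN_CMD, shell=True)  # e.g. "kill -TERM ${cmd_pid}"
        sys.exit(rc)
    sys.exit(128 + signum)  # conventional exit code for "terminated by signal"

signal.signal(signal.SIGINT, safe_exit)
signal.signal(signal.SIGTERM, safe_exit)

# ... the long-running benchmark command would execute here ...
```

Note that the cleanup status is captured once into `rc`; in shell, `$?` is overwritten by each subsequent test, which is why scripts often save it to a variable before branching on it.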
30 changes: 29 additions & 1 deletion script/build-mlperf-inference-server-nvidia/_cm.yaml
@@ -218,7 +218,6 @@ versions:
version: r3.0
nvidia-scratch-space:
tags: _version.3_0

r3.1:
add_deps_recursive:
nvidia-inference-common-code:
@@ -237,6 +236,26 @@
- tags: install,nccl,libs,_cuda

r4.0:
add_deps_recursive:
nvidia-inference-common-code:
version: r4.0
nvidia-scratch-space:
tags: _version.4_0
default_env:
BUILD_TRTLLM: 1
deps:
- tags: get,generic,sys-util,_nlohmann-json3-dev
- tags: get,generic,sys-util,_git-lfs
- tags: install,pytorch,from.src,_for-nvidia-mlperf-inference-v4.0
names:
- pytorch
- torch
- tags: install,torchvision,from.src,_for-nvidia-mlperf-inference-v4.0
names:
- pytorchvision
- torchvision

r4.1-dev:
add_deps_recursive:
nvidia-inference-common-code:
version: r4.0
@@ -255,6 +274,15 @@
names:
- pytorchvision
- torchvision

r4.1:
add_deps_recursive:
nvidia-inference-common-code:
version: r4.1
nvidia-scratch-space:
tags: _version.4_1
default_env:
BUILD_TRTLLM: 1

docker:
skip_run_cmd: 'no'
3 changes: 2 additions & 1 deletion script/build-mlperf-inference-server-nvidia/customize.py
@@ -21,7 +21,8 @@ def preprocess(i):
if env.get('CM_GCC_VERSION', '') != '':
gcc_major_version = env['CM_GCC_VERSION'].split(".")[0]
if int(gcc_major_version) > 10:
cxxflags.append("-Wno-error=range-loop-construct")
if env.get('CM_MLPERF_INFERENCE_VERSION','') != "4.1":
cxxflags.append("-Wno-error=range-loop-construct")

if env.get('CM_MLPERF_DEVICE','') == "inferentia":
env['USE_INFERENTIA'] = "1"