Merge pull request #208 from mlcommons/mlperf-inference
Mlperf inference
arjunsuresh authored Sep 13, 2024
2 parents 5b20820 + 0cf5b7e commit ad11c90
Showing 37 changed files with 445 additions and 151 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test-cm-tutorial-tvm-pip.yml
@@ -5,7 +5,7 @@ name: CM tutorial tvm pip install

on:
pull_request:
branches: [ "main", "test" ]
branches: [ "main", "test", "mlperf-inference" ]
paths:
- '.github/workflows/test-cm-tutorial-tvm-pip.yml'
- '**'
6 changes: 3 additions & 3 deletions .github/workflows/test-mlperf-inference-abtf-poc.yml
@@ -37,7 +37,7 @@ jobs:
cm pull repo mlcommons@cm4abtf --branch=poc
- name: Test MLPerf Inference ABTF POC using ${{ matrix.backend }} on docker
run: |
cm run script --tags=run-abtf,inference,_poc-demo --test_query_count=5 --adr.compiler.tags=gcc --quiet -v
cm run script --tags=run-abtf,inference,_poc-demo --test_query_count=5 --adr.compiler.tags=gcc --adr.cocoeval.version_max=1.5.7 --adr.cocoeval.version_max_usable=1.5.7 --quiet -v
build2:
runs-on: ${{ matrix.os }}
@@ -62,7 +62,7 @@ jobs:
cm pull repo mlcommons@cm4abtf --branch=poc
- name: Test MLPerf Inference ABTF POC using ${{ matrix.backend }} on ${{ matrix.os }}
run: |
cm run script --tags=run-abtf,inference,_poc-demo --adr.compiler.tags=gcc --quiet -v
cm run script --tags=run-abtf,inference,_poc-demo --adr.compiler.tags=gcc --adr.cocoeval.version_max=1.5.7 --adr.cocoeval.version_max_usable=1.5.7 --quiet -v
build3:
runs-on: ${{ matrix.os }}
@@ -89,4 +89,4 @@ jobs:
cm pull repo mlcommons@cm4abtf --branch=poc
- name: Test MLPerf Inference ABTF POC using ${{ matrix.backend }} on ${{ matrix.os }}
run: |
cm run script --tags=run-abtf,inference,_poc-demo --quiet --env.CM_MLPERF_LOADGEN_BUILD_FROM_SRC=off -v
cm run script --tags=run-abtf,inference,_poc-demo --quiet --env.CM_MLPERF_LOADGEN_BUILD_FROM_SRC=off --adr.cocoeval.version_max=1.5.7 --adr.cocoeval.version_max_usable=1.5.7 -v
9 changes: 5 additions & 4 deletions automation/script/module.py
@@ -2340,7 +2340,9 @@ def search(self, i):
# Print filtered paths if console
if console:
for script in r['list']:
logging.info(script.path)
# This should not be logging since the output can be consumed by other external tools and scripts
# logging.info(script.path)
print (script.path)

# Finalize output
r['script_tags'] = script_tags
@@ -2355,7 +2357,7 @@ def test(self, i):
Test automation (TBD)
Args:
(CM input dict):
(CM input dict):
(out) (str): if 'con', output to console
@@ -2641,8 +2643,7 @@ def add(self, i):
if k in ii: del ii[k]

if artifact_repo != None:
artifact = ii.get('artifact','')
ii['artifact'] = utils.assemble_cm_object2(artifact_repo) + ':' + artifact
ii['artifact'] = utils.assemble_cm_object2(artifact_repo) + ':' + utils.assemble_cm_object2(artifact_repo)

r_obj=self.cmind.access(ii)
if r_obj['return']>0: return r_obj
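The switch from `logging.info()` to a bare `print()` in `search()` matters because the listed script paths are meant to be consumed by other tools and scripts, as the new comment says. A minimal sketch of the difference (not part of the commit; the path is made up):

```python
# Illustration only: logging records go to stderr with an "INFO:root:" prefix by
# default, so a downstream pipe that reads stdout would see nothing useful;
# print() emits the bare path on stdout.
import logging

logging.basicConfig(level=logging.INFO)

script_path = "/home/user/CM/repos/mlcommons@cm4mlops/script/app-mlperf-inference"  # hypothetical

logging.info(script_path)   # stderr: "INFO:root:/home/user/..." -> lost when only stdout is piped
print(script_path)          # stdout: the bare path, easy for external tools to parse
```

With the `print()` version, redirecting stdout (e.g. something like `cm find script --tags=... > paths.txt`) yields exactly one clean path per line.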
4 changes: 4 additions & 0 deletions script/app-mlperf-inference-amd/customize.py
@@ -13,6 +13,9 @@ def preprocess(i):
if env.get('CM_MLPERF_SKIP_RUN', '') == "yes":
return {'return':0}

env['CM_MLPERF_AMD_SCRIPT_PATH'] = env['CM_TMP_CURRENT_SCRIPT_PATH']
env['CM_MLPERF_AMD_CODE_PATH'] = os.path.join(env['CM_MLPERF_INFERENCE_IMPLEMENTATION_REPO'], "closed", "AMD")

if 'CM_MODEL' not in env:
return {'return': 1, 'error': 'Please select a variation specifying the model to run'}
if 'CM_MLPERF_BACKEND' not in env:
@@ -22,6 +25,7 @@

if "llama2" in env['CM_MODEL']:
env['CM_RUN_DIR'] = i['run_script_input']['path']
env['CM_MLPERF_AMD_LLAMA2_CODE_PATH'] = os.path.join(env['CM_MLPERF_AMD_CODE_PATH'], "llama2-70b-99.9/VllmFp8")
env['CM_RUN_CMD'] = "bash run-llama2.sh "
else:
return {'return':1, 'error':'Model {} not supported'.format(env['CM_MODEL'])}
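The two path variables added in `preprocess()` anchor the AMD harness inside the inference-results checkout, and `CM_MLPERF_AMD_LLAMA2_CODE_PATH` is what the new run-llama2.sh below points at. A sketch with a made-up repo root (not part of the commit):

```python
# Illustration only: how the added environment variables resolve, assuming a
# hypothetical value for CM_MLPERF_INFERENCE_IMPLEMENTATION_REPO.
import os

env = {"CM_MLPERF_INFERENCE_IMPLEMENTATION_REPO": "/work/inference_results_v4.1"}  # assumed

code_path = os.path.join(env["CM_MLPERF_INFERENCE_IMPLEMENTATION_REPO"], "closed", "AMD")
llama2_path = os.path.join(code_path, "llama2-70b-99.9/VllmFp8")

print(code_path)    # /work/inference_results_v4.1/closed/AMD
print(llama2_path)  # /work/inference_results_v4.1/closed/AMD/llama2-70b-99.9/VllmFp8
```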
56 changes: 56 additions & 0 deletions script/app-mlperf-inference-amd/run-llama2.sh
@@ -0,0 +1,56 @@
#!/bin/bash

set -xeu

N_SAMPLES=${N_SAMPLES:-24576} #24576 #3072 #2457 #6
TP=1
DP=${DP:-8}
WD=${WD:-0}
SORTING=${SORTING:-descending} #ascending #descending #lexicographic #skip

export HIP_FORCE_DEV_KERNARG=1
export VLLM_USE_TRITON_FLASH_ATTN=0
export VLLM_FP8_PADDING=1
export VLLM_FP8_ACT_PADDING=1
export VLLM_FP8_WEIGHT_PADDING=1
export VLLM_FP8_REDUCE_CONV=1
export VLLM_SCHED_PREFILL_KVC_FREEPCT=31.0

export HARNESS_DISABLE_VLLM_LOGS=1
export VLLM_LOGGING_LEVEL=ERROR

MODEL_PATH=${CM_ML_MODEL_LLAMA2_FILE_WITH_PATH:-/data/llm/llama2-70b-chat/}
DATASET_PATH=${CM_DATASET_OPENORCA_PREPROCESSED_PATH:-/data/open_orca/open_orca_gpt4_tokenized_llama.sampled_24576.pkl.gz}
QUANTIZED_WEIGHTS_PATH=${CM_LLAMA2_FINAL_SAFE_TENSORS_PATH:-quantized/quark_share/modelzoo/llama2_70b_wfp8_afp8_ofp8_nomerge/json-safetensors/llama.safetensors}
QUANTIZATION_PARAM_PATH=${QUANTIZATION_PARAM_PATH:-/app/kv_cache_scales.json}

MLPERF_CONF="${CM_MLPERF_CONF:-/app/mlperf_inference/mlperf.conf}"
USER_CONF="${CM_MLPERF_USER_CONF:-/lab-mlperf-inference/code/llama2-70b-99.9/mlperf_config_VllmFp8/user.conf}"

SUBMISSION=${SUBMISSION:-0}

LOG_DIR=${CM_MLPERF_OUTPUT_DIR}

cp $USER_CONF ${LOG_DIR}/user.conf

cmd="${CM_PYTHON_BIN_WITH_PATH} ${CM_MLPERF_AMD_LLAMA2_CODE_PATH}/mainVllmFp8_Offline.py \
--scenario ${CM_MLPERF_LOADGEN_SCENARIO} \
--output-log-dir ${LOG_DIR} \
--model-path $MODEL_PATH \
--mlperf-conf $MLPERF_CONF \
--user-conf $USER_CONF \
--total-sample-count $N_SAMPLES \
--dataset-path $DATASET_PATH \
--dtype float16 \
--backend vllm \
--device cuda:0 \
--kv-cache-dtype fp8 \
-tp ${TP} \
-dp ${DP} \
--quantization fp8 \
--quantized-weights-path ${QUANTIZED_WEIGHTS_PATH} \
--quantization-param-path ${QUANTIZATION_PARAM_PATH} \
--warmup-duration ${WD} \
--sorting ${SORTING} \
--enforce-eager True \
--gpu-memory-utilization 0.99"
3 changes: 3 additions & 0 deletions script/app-mlperf-inference-mlcommons-python/_cm.yaml
@@ -780,6 +780,9 @@ variations:
CM_MLPERF_BACKEND_VERSION: <<<CM_ONNXRUNTIME_VERSION>>>
deps:
- tags: get,generic-python-lib,_onnx
- tags: get,generic-python-lib,_numpy
version_max: "1.26.4"
version_max_usable: "1.26.4"
- tags: get,tvm
names:
- tvm
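The new `_numpy` dependency is pinned with `version_max: 1.26.4` because NumPy 2.x is not binary-compatible with extensions built against the 1.x series, which is the usual reason for pins like this. A hypothetical runtime guard (not part of the commit, assumes the `packaging` package is installed) expressing the same constraint:

```python
# Hypothetical guard mirroring the version_max pin above: reject NumPy 2.x,
# which extensions compiled against 1.x generally cannot import.
from packaging.version import Version

import numpy as np

MAX_SUPPORTED = Version("1.26.4")  # matches version_max / version_max_usable in _cm.yaml

if Version(np.__version__) > MAX_SUPPORTED:
    raise RuntimeError(
        f"numpy {np.__version__} is newer than the pinned maximum {MAX_SUPPORTED}; "
        "install numpy<=1.26.4 for this variation"
    )
```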
17 changes: 16 additions & 1 deletion script/app-mlperf-inference-nvidia/_cm.yaml
@@ -316,9 +316,16 @@ post_deps:
# Variations to customize dependencies
variations:
# MLPerf inference version
v4.0:
v4.1:
group: version
default: true
env:
CM_MLPERF_INFERENCE_CODE_VERSION: "v4.1"
adr:
pytorch:
tags: _for-nvidia-mlperf-inference-v4.1
v4.0:
group: version
env:
CM_MLPERF_INFERENCE_CODE_VERSION: "v4.0"
CM_MLPERF_GPTJ_MODEL_FP8_PATH_SUFFIX: GPTJ-FP8-quantized
@@ -455,6 +462,14 @@ variations:
- scipy
version: 1.10.1

sdxl,v4.1:
deps:
- tags: get,generic-python-lib,_package.torchrec
version: 0.4.0
- tags: get,generic-python-lib,_package.torchmetrics
version: 1.0.3
- tags: get,generic-python-lib,_package.typeguard

bert_:
deps:
- tags: get,generic-python-lib,_transformers
20 changes: 15 additions & 5 deletions script/app-mlperf-inference/_cm.yaml
@@ -307,7 +307,7 @@ variations:

nvidia-original,r4.1_default:
docker:
base_image: nvcr.io/nvidia/mlperf/mlperf-inference:mlpinf-v4.0-cuda12.2-cudnn8.9-x86_64-ubuntu20.04-public
base_image: nvcr.io/nvidia/mlperf/mlperf-inference:mlpinf-v4.1-cuda12.4-pytorch24.04-ubuntu22.04-x86_64-release

nvidia-original,r4.1_default,gptj_:
docker:
@@ -349,6 +349,8 @@ variations:
os_version: "20.04"
deps:
- tags: get,mlperf,inference,nvidia,scratch,space
names:
- mlperf-inference-nvidia-scratch-space
- tags: get,nvidia-docker
skip_if_env:
CM_SKIP_GET_NVIDIA_DOCKER:
@@ -1114,6 +1116,9 @@ variations:
all_gpus: 'yes'
deps:
- tags: get,nvidia-docker
skip_if_env:
CM_SKIP_GET_NVIDIA_DOCKER:
- yes
group:
device
env:
@@ -1415,19 +1420,19 @@
reproducibility
add_deps_recursive:
nvidia-inference-common-code:
version: r4.0
version: r4.1
tags: _go
nvidia-inference-server:
version: r4.0
version: r4.1
tags: _go
intel-harness:
tags: _v4.0
tags: _v4.1
default_env:
CM_SKIP_SYS_UTILS: 'yes'
CM_REGENERATE_MEASURE_FILES: 'yes'
env:
CM_ENV_NVMITTEN_DOCKER_WHEEL_PATH: '/opt/nvmitten-0.1.3b0-cp38-cp38-linux_x86_64.whl'

CM_MLPERF_INFERENCE_VERSION: '4.1'

invalid_variation_combinations:
-
@@ -1523,7 +1528,11 @@ docker:
use_host_user_id: True
deps:
- tags: get,mlperf,inference,results,dir,local
names:
- get-mlperf-inference-results-dir
- tags: get,mlperf,inference,submission,dir,local
names:
- get-mlperf-inference-submission-dir
pre_run_cmds:
#- cm pull repo && cm run script --tags=get,git,repo,_repo.https://github.com/GATEOverflow/inference_results_v4.0.git --update
- cm pull repo
@@ -1536,6 +1545,7 @@ docker:
- "${{ LLAMA2_CHECKPOINT_PATH }}:${{ LLAMA2_CHECKPOINT_PATH }}"
- "${{ DLRM_DATA_PATH }}:/home/mlperf_inf_dlrmv2"
- "${{ CM_NVIDIA_LLAMA_DATASET_FILE_PATH }}:${{ CM_NVIDIA_LLAMA_DATASET_FILE_PATH }}"
- "${{ SDXL_CHECKPOINT_PATH }}:${{ SDXL_CHECKPOINT_PATH }}"
skip_run_cmd: 'no'
shm_size: '32gb'
interactive: True
2 changes: 1 addition & 1 deletion script/app-mlperf-inference/customize.py
@@ -206,7 +206,7 @@ def postprocess(i):
cm_sut_info['device'] = env['CM_MLPERF_DEVICE']
cm_sut_info['framework'] = state['CM_SUT_META']['framework']
cm_sut_info['run_config'] = env['CM_MLPERF_INFERENCE_SUT_RUN_CONFIG']
with open(os.path.join(result_sut_folder_path,"cm_sut_info.json"), "w") as fp:
with open(os.path.join(result_sut_folder_path,"cm-sut-info.json"), "w") as fp:
json.dump(cm_sut_info, fp, indent=2)

system_meta = state['CM_SUT_META']
2 changes: 1 addition & 1 deletion script/benchmark-program/customize.py
@@ -81,7 +81,7 @@ def preprocess(i):
# generate the post run cmd - for killing the process that records runtime system infos
post_run_cmd = ""
if env.get('CM_PROFILE_NVIDIA_POWER', '') == "on":
post_run_cmd += "echo 'killing process \${cmd_pid}' && kill -TERM \${cmd_pid}"
post_run_cmd += "echo killing process \$cmd_pid && kill -TERM \${cmd_pid}"
print(f"Post run command for killing the process that measures the runtime system information: {post_run_cmd}")

env['CM_POST_RUN_CMD'] = post_run_cmd
24 changes: 16 additions & 8 deletions script/benchmark-program/run.sh
100644 → 100755
@@ -1,4 +1,20 @@
#!/bin/bash

# function to safely exit the background process
safe_exit() {
if [[ "${CM_POST_RUN_CMD}" != "" ]]; then
eval ${CM_POST_RUN_CMD}
if [ $? -eq 0 ]; then
exit 0
else
exit $?
fi
fi
}

# trap signals to redirect the execution flow to safe_exit
trap safe_exit SIGINT SIGTERM

if [[ ${CM_MLPERF_POWER} == "yes" && ${CM_MLPERF_LOADGEN_MODE} == "performance" ]]; then
exit 0
fi
@@ -45,18 +61,10 @@ eval ${CM_PRE_RUN_CMD}
if [[ "${CM_RUN_CMD0}" != "" ]]; then
eval ${CM_RUN_CMD0}
exitstatus=$?
if [ -e exitstatus ]; then
exitstatus=$( cat exitstatus )
fi
test $exitstatus -eq 0 || $exitstatus
else
echo "${CM_RUN_CMD}"
eval ${CM_RUN_CMD}
exitstatus=$?
if [ -e exitstatus ]; then
exitstatus=$( cat exitstatus )
fi
test $exitstatus -eq 0 || $exitstatus
fi

eval ${CM_POST_RUN_CMD}
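The new `trap safe_exit SIGINT SIGTERM` block ensures the post-run command (typically the `kill -TERM` of the background power-monitoring process generated in customize.py above) still runs when the benchmark is interrupted. A rough Python analogue of the same pattern, assuming `CM_POST_RUN_CMD` holds a shell command (sketch only, not part of the commit):

```python
# Rough analogue of the trap/safe_exit pattern added to run.sh: register a handler
# for SIGINT/SIGTERM that runs the post-run cleanup command before exiting, so an
# interrupted benchmark still stops its helper processes.
import os
import signal
import subprocess
import sys

POST_RUN_CMD = os.environ.get("CM_POST_RUN_CMD", "")

def safe_exit(signum, frame):
    if POST_RUN_CMD:
        rc = subprocess.call(POST_RUN_CMD, shell=True)  # e.g. "kill -TERM ${cmd_pid}"
        sys.exit(rc)
    sys.exit(128 + signum)  # conventional exit code for "terminated by signal"

signal.signal(signal.SIGINT, safe_exit)
signal.signal(signal.SIGTERM, safe_exit)

# ... the long-running benchmark command would execute here ...
```

Note that the cleanup status is captured once into `rc`; in shell, `$?` is overwritten by each subsequent test, which is why scripts often save it to a variable before branching on it.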
30 changes: 29 additions & 1 deletion script/build-mlperf-inference-server-nvidia/_cm.yaml
@@ -218,7 +218,6 @@ versions:
version: r3.0
nvidia-scratch-space:
tags: _version.3_0

r3.1:
add_deps_recursive:
nvidia-inference-common-code:
@@ -237,6 +236,26 @@
- tags: install,nccl,libs,_cuda

r4.0:
add_deps_recursive:
nvidia-inference-common-code:
version: r4.0
nvidia-scratch-space:
tags: _version.4_0
default_env:
BUILD_TRTLLM: 1
deps:
- tags: get,generic,sys-util,_nlohmann-json3-dev
- tags: get,generic,sys-util,_git-lfs
- tags: install,pytorch,from.src,_for-nvidia-mlperf-inference-v4.0
names:
- pytorch
- torch
- tags: install,torchvision,from.src,_for-nvidia-mlperf-inference-v4.0
names:
- pytorchvision
- torchvision

r4.1-dev:
add_deps_recursive:
nvidia-inference-common-code:
version: r4.0
@@ -255,6 +274,15 @@
names:
- pytorchvision
- torchvision

r4.1:
add_deps_recursive:
nvidia-inference-common-code:
version: r4.1
nvidia-scratch-space:
tags: _version.4_1
default_env:
BUILD_TRTLLM: 1

docker:
skip_run_cmd: 'no'
3 changes: 2 additions & 1 deletion script/build-mlperf-inference-server-nvidia/customize.py
@@ -21,7 +21,8 @@ def preprocess(i):
if env.get('CM_GCC_VERSION', '') != '':
gcc_major_version = env['CM_GCC_VERSION'].split(".")[0]
if int(gcc_major_version) > 10:
cxxflags.append("-Wno-error=range-loop-construct")
if env.get('CM_MLPERF_INFERENCE_VERSION','') != "4.1":
cxxflags.append("-Wno-error=range-loop-construct")

if env.get('CM_MLPERF_DEVICE','') == "inferentia":
env['USE_INFERENTIA'] = "1"