Merge branch 'mlperf-inference' into nvidiaAllGhaction
anandhu-eng authored Oct 14, 2024
2 parents e375b7e + 052dd0d commit c8878df
Showing 23 changed files with 197 additions and 42 deletions.
48 changes: 48 additions & 0 deletions .github/workflows/test-mlperf-inference-dlrm.yml
@@ -0,0 +1,48 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: MLPerf inference DLRM-v2

on:
schedule:
- cron: "30 1 * * *"

jobs:
build_reference:
if: github.repository_owner == 'gateoverflow'
runs-on: [ self-hosted, GO-spr, linux, x64 ]
strategy:
fail-fast: false
matrix:
python-version: [ "3.12" ]
device: [ "cpu" ]

steps:
- name: Test MLPerf Inference DLRM-v2 reference implementation
run: |
source gh_action/bin/deactivate || python3 -m venv gh_action
source gh_action/bin/activate
export CM_REPOS=$HOME/GH_CM
python3 -m pip install cm4mlops
cm pull repo
cm run script --tags=run-mlperf,inference,_performance-only --submitter="MLCommons" --model=dlrm-v2-99 --implementation=reference --backend=pytorch --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --target_qps=1 --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --clean
build_intel:
if: github.repository_owner == 'gateoverflow_off'
runs-on: [ self-hosted, GO-spr, linux, x64 ]
strategy:
fail-fast: false
matrix:
python-version: [ "3.12" ]
backend: [ "pytorch" ]
device: [ "cpu" ]

steps:
- name: Test MLPerf Inference DLRM-v2 INTEL implementation
run: |
source gh_action/bin/deactivate || python3 -m venv gh_action
source gh_action/bin/activate
export CM_REPOS=$HOME/GH_CM
python3 -m pip install cm4mlops
cm pull repo
cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --model=dlrm-v2-99 --implementation=intel --batch_size=1 --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --target_qps=1 --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean
8 changes: 3 additions & 5 deletions .github/workflows/test-mlperf-inference-gptj.yml
@@ -5,7 +5,7 @@ name: MLPerf inference GPT-J

on:
schedule:
- cron: "1 2 * * *"
- cron: "15 19 * * *"

jobs:
build:
@@ -19,15 +19,13 @@ jobs:
precision: [ "float16" ]

steps:
- name: Install dependencies
- name: Test MLPerf Inference GPTJ
run: |
source gh_action/bin/deactivate || python3 -m venv gh_action
source gh_action/bin/activate
export CM_REPOS=$HOME/GH_CM
python3 -m pip install cm4mlops
cm pull repo
- name: Test MLPerf Inference GPTJ
run: |
cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=gptj-99 --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --beam_size=1 --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean
cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=gptj-99 --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --beam_size=1 --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --get_platform_details=yes --implementation=reference --clean
cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/mlperf_inference_test_submissions_v5.0 --repo_branch=main --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet --submission_dir=$HOME/gh_action_submissions
7 changes: 3 additions & 4 deletions .github/workflows/test-mlperf-inference-sdxl.yaml
@@ -1,8 +1,7 @@
name: MLPerf inference SDXL
#off now as we have SCC24 test doing the same
on:
schedule:
- cron: "1 2 * * *"
- cron: "30 9 * * *"

jobs:
build_reference:
@@ -22,5 +21,5 @@ jobs:
export CM_REPOS=$HOME/GH_CM
python3 -m pip install cm4mlops
cm pull repo
cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=sdxl --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean
cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet --submission_dir=$HOME/gh_action_submissions
cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=sdxl --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean
cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/mlperf_inference_test_submissions_v5.0 --repo_branch=main --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet --submission_dir=$HOME/gh_action_submissions
4 changes: 2 additions & 2 deletions .github/workflows/test-scc24-sdxl.yaml
@@ -2,7 +2,7 @@ name: MLPerf inference SDXL (SCC)

on:
schedule:
- cron: "1 3 * * *"
- cron: "5 2 * * *"

jobs:
build_reference:
@@ -52,7 +52,7 @@ jobs:
pip install --upgrade cm4mlops
pip install tabulate
cm pull repo
cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --hw_name=go-spr --clean
cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --pull_changes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --hw_name=go-spr --custom_system_nvidia=yes --clean
cm run script --tags=run-mlperf,inference,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean
cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions --results_dir=$HOME/scc_gh_action_results/test_results
cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet --submission_dir=$HOME/scc_gh_action_submissions
8 changes: 4 additions & 4 deletions script/app-mlperf-inference-intel/_cm.yaml
@@ -940,10 +940,10 @@ variations:
names:
- pip-package
- accelerate
- tags: get,generic-python-lib,_package.torch,_path.https://download.pytorch.org/whl/nightly/cpu-cxx11-abi/torch-2.1.0.dev20230715%2Bcpu.cxx11.abi-cp39-cp39-linux_x86_64.whl
names:
- pip-package
- pip-torch
- tags: install,pytorch,from-src,_for-intel-mlperf-inference-v3.1-dlrm-v2
names:
- pytorch
- torch
dlrm-v2_:
env: {}

7 changes: 7 additions & 0 deletions script/app-mlperf-inference-nvidia/_cm.yaml
@@ -242,6 +242,13 @@ deps:
names:
- nvidia-inference-common-code

- tags: pull,git,repo
env:
CM_GIT_CHECKOUT_PATH: '<<<CM_MLPERF_INFERENCE_NVIDIA_CODE_PATH>>>'
enable_if_env:
CM_MLPERF_INFERENCE_PULL_CODE_CHANGES:
- 'yes'

# Creates user conf for given SUT
- tags: generate,user-conf,mlperf,inference
names:
19 changes: 17 additions & 2 deletions script/app-mlperf-inference/_cm.yaml
@@ -111,6 +111,17 @@ deps:

posthook_deps:
- tags: get,mlperf,sut,description #populate system meta information like framework
- tags: get,platform,details
enable_if_any_env:
CM_SUDO_USER:
- yes
CM_GET_PLATFORM_DETAILS:
- yes
skip_if_env:
CM_MLPERF_LOADGEN_MODE:
- accuracy
env:
CM_PLATFORM_DETAILS_FILE_PATH: '<<<CM_MLPERF_OUTPUT_DIR>>>/system_info.txt'
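
Note on the gating of the new get,platform,details posthook dependency above (an illustrative sketch of the declared enable_if_any_env / skip_if_env conditions, not the actual CM dependency resolver; env values are assumed to be the strings shown in the YAML):

    # Platform details are collected only when either trigger variable is set,
    # and never for accuracy runs. Illustration only; the real logic is in CM.
    def run_platform_details(env):
        enabled = (env.get('CM_SUDO_USER') == 'yes'
                   or env.get('CM_GET_PLATFORM_DETAILS') == 'yes')
        skipped = env.get('CM_MLPERF_LOADGEN_MODE') == 'accuracy'
        return enabled and not skipped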

# Order of variations for documentation
variation_groups_order:
@@ -985,19 +996,24 @@ variations:
docker:
deps:
- tags: get,dlrm,data,mlperf,inference,_nvidia
mounts:
- "${{ DLRM_DATA_PATH }}:/home/mlperf_inf_dlrmv2"

dlrm_,intel:
docker:
deps:
- tags: get,preprocessed,dataset,criteo,_mlc
mounts:
- "${{ DLRM_DATA_PATH }}:${{ DLRM_DATA_PATH }}"

dlrm_,reference:
docker:
deps:
- tags: get,preprocessed,dataset,criteo,_mlc
- tags: get,ml-model,dlrm,_pytorch,_fp32
mounts:
- ${{ CM_ML_MODEL_FILE_WITH_PATH }}:${{ CM_ML_MODEL_FILE_WITH_PATH }}
- "${{ CM_ML_MODEL_FILE_WITH_PATH }}:${{ CM_ML_MODEL_FILE_WITH_PATH }}"
- "${{ DLRM_DATA_PATH }}:${{ DLRM_DATA_PATH }}"
dockerfile_env:
CM_ML_MODEL_FILE_WITH_PATH: "on"
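
For context on the "${{ VAR }}" mount entries added above: each template is filled in from the environment and passed to docker as a host:container bind mount. The sketch below is a simplified illustration of that expansion under the assumption that mounts with unset variables are dropped; the actual handling lives in CM's docker support scripts.

    import re

    def expand_mounts(mount_templates, env):
        # Substitute "${{ VAR }}" placeholders and emit docker "-v host:container" args.
        args = []
        for template in mount_templates:
            expanded = re.sub(r'\$\{\{ *([A-Za-z0-9_]+) *\}\}',
                              lambda m: env.get(m.group(1), ''), template)
            host, _, container = expanded.partition(':')
            if host and container:      # skip mounts whose variables are unset (assumption)
                args += ['-v', host + ':' + container]
        return args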

@@ -1626,7 +1642,6 @@ docker:
- "${{ GPTJ_CHECKPOINT_PATH }}:${{ GPTJ_CHECKPOINT_PATH }}"
- "${{ CM_CRITEO_PREPROCESSED_PATH }}:${{ CM_CRITEO_PREPROCESSED_PATH }}"
- "${{ LLAMA2_CHECKPOINT_PATH }}:${{ LLAMA2_CHECKPOINT_PATH }}"
- "${{ DLRM_DATA_PATH }}:/home/mlperf_inf_dlrmv2"
- "${{ CM_NVIDIA_LLAMA_DATASET_FILE_PATH }}:${{ CM_NVIDIA_LLAMA_DATASET_FILE_PATH }}"
- "${{ SDXL_CHECKPOINT_PATH }}:${{ SDXL_CHECKPOINT_PATH }}"
- "${{ CM_DATASET_KITS19_PREPROCESSED_PATH }}:${{ CM_DATASET_KITS19_PREPROCESSED_PATH }}"
@@ -25,6 +25,9 @@ def preprocess(i):
if env.get('CM_CLEAN_ARTIFACT_NAME', '') == 'preprocessed_data':
clean_cmd = f"""rm -rf {os.path.join(env['CM_NVIDIA_MLPERF_SCRATCH_PATH'], "preprocessed_data", "coco2014-tokenized-sdxl")} """
cache_rm_tags = "nvidia-harness,_preprocess_data,_sdxl"
if env.get('CM_CLEAN_ARTIFACT_NAME', '') == 'downloaded_model':
clean_cmd = f"""rm -rf {os.path.join(env['CM_NVIDIA_MLPERF_SCRATCH_PATH'], "models", "SDXL")} """
cache_rm_tags = "nvidia-harness,_download_model,_sdxl"

cache_rm_tags = cache_rm_tags + extra_cache_rm_tags

27 changes: 26 additions & 1 deletion script/generate-mlperf-inference-submission/customize.py
@@ -239,8 +239,11 @@ def generate_submission(i):

results = {}

model_platform_info_file = None

for model in models:
results[model] = {}
platform_info_file = None
result_model_path = os.path.join(result_path, model)
submission_model_path = os.path.join(submission_path, model)
measurement_model_path = os.path.join(measurement_path, model)
@@ -386,8 +389,10 @@ def generate_submission(i):
files.append(f)
elif f == "spl.txt":
files.append(f)
elif f in [ "README.md", "README-extra.md", "cm-version-info.json", "os_info.json", "cpu_info.json", "pip_freeze.json" ] and mode == "performance":
elif f in [ "README.md", "README-extra.md", "cm-version-info.json", "os_info.json", "cpu_info.json", "pip_freeze.json", "system_info.txt" ] and mode == "performance":
shutil.copy(os.path.join(result_mode_path, f), os.path.join(submission_measurement_path, f))
if f == "system_info.txt" and not platform_info_file:
platform_info_file = os.path.join(result_mode_path, f)
elif f in [ "console.out" ]:
shutil.copy(os.path.join(result_mode_path, f), os.path.join(submission_measurement_path, mode+"_"+f))

@@ -417,6 +422,26 @@ def generate_submission(i):
with open(readme_file, mode='a') as f:
f.write(result_string)

#Copy system_info.txt to the submission measurements model folder if any scenario performance run has it
sys_info_file = None
if os.path.exists(os.path.join(result_model_path, "system_info.txt")):
sys_info_file = os.path.join(result_model_path, "system_info.txt")
elif platform_info_file:
sys_info_file = platform_info_file
if sys_info_file:
model_platform_info_file = sys_info_file
shutil.copy(sys_info_file, os.path.join(measurement_model_path, "system_info.txt"))

#Copy system_info.txt to the submission measurements folder if any model performance run has it
sys_info_file = None
if os.path.exists(os.path.join(result_path, "system_info.txt")):
sys_info_file = os.path.join(result_path, "system_info.txt")
elif model_platform_info_file:
sys_info_file = model_platform_info_file
if sys_info_file:
shutil.copy(sys_info_file, os.path.join(measurement_path, "system_info.txt"))


with open(system_file, "w") as fp:
json.dump(system_meta, fp, indent=2)
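
The copy logic added above gives a two-level fallback: the model-level measurements directory prefers a system_info.txt found directly under the model's results and otherwise inherits the one captured by a scenario performance run, and the top-level measurements directory repeats the same choice one level up. A condensed, illustrative sketch of that precedence (not the actual submission generator):

    import os

    def pick_system_info(level_dir, inherited_path):
        # Prefer a file already present at this level, else inherit from below.
        local = os.path.join(level_dir, "system_info.txt")
        return local if os.path.exists(local) else inherited_path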

4 changes: 2 additions & 2 deletions script/get-cuda-devices/customize.py
@@ -53,8 +53,8 @@ def postprocess(i):
key_env = 'CM_CUDA_DEVICE_PROP_'+key.upper().replace(' ','_')
env[key_env] = val

state['cm_cuda_num_devices'] = gpu_id
env['CM_CUDA_NUM_DEVICES'] = gpu_id
state['cm_cuda_num_devices'] = gpu_id + 1
env['CM_CUDA_NUM_DEVICES'] = gpu_id + 1

state['cm_cuda_device_prop'] = p
state['cm_cuda_devices_prop'] = gpu
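
The change above fixes an off-by-one: gpu_id holds the zero-based index of the last enumerated device, so the device count is gpu_id + 1. A minimal sketch of the pattern, using a hypothetical detection result rather than the script's real probing code:

    # With two GPUs, the loop leaves gpu_id == 1, so the count must be gpu_id + 1.
    devices = ["NVIDIA RTX 4090", "NVIDIA RTX 4090"]   # hypothetical detection output
    gpu_id = -1
    gpu = {}
    for gpu_id, name in enumerate(devices):
        gpu[gpu_id] = {"GPU Device ID": gpu_id, "GPU Name": name}
    num_devices = gpu_id + 1                            # == len(devices) == 2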
20 changes: 17 additions & 3 deletions script/get-generic-python-lib/_cm.json
@@ -874,12 +874,26 @@
"deps": [
{
"tags": "get,generic-python-lib,_package.networkx",
"enable_if_env": {
"CM_PYTHON_MINOR_VERSION": [ "7", "8" ]
}
"enable_if_env": {
"CM_PYTHON_MINOR_VERSION": [ "7", "8" ]
}
}
]
},
"cxx11-abi": {
"env": {
}
},
"torch,cxx11-abi": {
"env": {
"CM_GENERIC_PYTHON_PIP_EXTRA_INDEX_URL": "https://download.pytorch.org/whl/nightly/cpu-cxx11-abi"
}
},
"package.torch,cxx11-abi": {
"env": {
"CM_GENERIC_PYTHON_PIP_INDEX_URL": "https://download.pytorch.org/whl/nightly/cpu-cxx11-abi"
}
},
"torch,pre": {
"default_env": {
"CM_GENERIC_PYTHON_PIP_UNINSTALL_DEPS": "torch"
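
For context on the new cxx11-abi variations: CM_GENERIC_PYTHON_PIP_INDEX_URL typically maps to pip's --index-url, which replaces the default PyPI index, while CM_GENERIC_PYTHON_PIP_EXTRA_INDEX_URL maps to --extra-index-url, which is searched in addition to it. The sketch below is a hedged illustration of that mapping, not the script's actual implementation in get-generic-python-lib.

    import os

    def pip_install_args(package):
        # Build an illustrative pip command from the two variation-controlled env vars.
        args = ["python3", "-m", "pip", "install", package]
        index_url = os.environ.get("CM_GENERIC_PYTHON_PIP_INDEX_URL", "").strip()
        extra_index_url = os.environ.get("CM_GENERIC_PYTHON_PIP_EXTRA_INDEX_URL", "").strip()
        if index_url:
            args += ["--index-url", index_url]              # replaces the default PyPI index
        if extra_index_url:
            args += ["--extra-index-url", extra_index_url]  # searched in addition to PyPI
        return args

    # e.g. with the _package.torch,cxx11-abi variation active, this would yield:
    # python3 -m pip install torch --index-url https://download.pytorch.org/whl/nightly/cpu-cxx11-abi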
1 change: 1 addition & 0 deletions script/get-generic-python-lib/customize.py
@@ -78,6 +78,7 @@ def preprocess(i):

# Check extra index URL
extra_index_url = env.get('CM_GENERIC_PYTHON_PIP_EXTRA_INDEX_URL','').strip()

if extra_index_url != '':
# Check special cases
if '${CM_TORCH_CUDA}' in extra_index_url:
5 changes: 3 additions & 2 deletions script/get-platform-details/customize.py
@@ -16,14 +16,15 @@ def preprocess(i):
if os_info['platform'] == "windows":
return {'return':1, 'error':'get-platform-details script not yet supported in windows!'}

print(env['CM_HOST_OS_KERNEL_VERSION'])

if not check_installation("numactl",os_info):
env['CM_INSTALL_NUMACTL'] = 'True'

#if not check_installation("cpupower",os_info):
env['CM_INSTALL_CPUPOWER'] = 'True'

if env.get('CM_PLATFORM_DETAILS_FILE_PATH', '') == '':
env['CM_PLATFORM_DETAILS_FILE_PATH'] = os.path.join(os.getcwd(), "system_info.txt")

return {'return':0}


15 changes: 8 additions & 7 deletions script/get-platform-details/run.sh
@@ -1,8 +1,9 @@
#!/bin/bash

OUTPUT_FILE="system_info.txt"

echo "WARNING: sudo permission is needed to some packages for measuring the platform details"
OUTPUT_FILE="$CM_PLATFORM_DETAILS_FILE_PATH"
#set -e
#echo $OUTPUT_FILE
echo "WARNING: sudo permission is needed for some of the below commands"

if [[ ${CM_HOST_OS_FLAVOR} == "macos" ]]; then
echo "WARNING: To be done for the mac os"
@@ -46,7 +47,7 @@ else

echo "8. numactl --hardware" >> $OUTPUT_FILE
eval "numactl --hardware" >> $OUTPUT_FILE
test $? -eq 0 || exit $?
#test $? -eq 0 || exit $?
echo "------------------------------------------------------------" >> $OUTPUT_FILE

echo "9. /proc/meminfo" >> $OUTPUT_FILE
@@ -81,7 +82,7 @@ else

echo "15. sysctl" >> $OUTPUT_FILE
eval "sudo sysctl -a" >> $OUTPUT_FILE
test $? -eq 0 || exit $?
#test $? -eq 0 || exit $?
echo "------------------------------------------------------------" >> $OUTPUT_FILE

echo "16. /sys/kernel/mm/transparent_hugepage" >> $OUTPUT_FILE
@@ -111,12 +112,12 @@ else

echo "21. dmidecode" >> $OUTPUT_FILE
eval "sudo dmidecode" >> $OUTPUT_FILE
test $? -eq 0 || exit $?
#test $? -eq 0 || exit $?
echo "------------------------------------------------------------" >> $OUTPUT_FILE

echo "22. BIOS" >> $OUTPUT_FILE
eval "sudo dmidecode -t bios" >> $OUTPUT_FILE
test $? -eq 0 || exit $?
#test $? -eq 0 || exit $?
echo "------------------------------------------------------------" >> $OUTPUT_FILE

echo "System information has been saved to $PWD/$OUTPUT_FILE"
6 changes: 3 additions & 3 deletions script/install-ipex-from-src/_cm.json
@@ -214,10 +214,10 @@
"version": "1.23.5"
},
{
"tags": "get,generic-python-lib,_package.torch,_path.https://download.pytorch.org/whl/nightly/cpu-cxx11-abi/torch-2.1.0.dev20230715%2Bcpu.cxx11.abi-cp39-cp39-linux_x86_64.whl",
"tags": "install,pytorch,from-src,_for-intel-mlperf-inference-v3.1-dlrm-v2",
"names": [
"pip-package",
"pip-torch"
"pytorch",
"torch"
]
}
],