Merge pull request #19 from arjunsuresh/mlperf-inference

Fixes for MLPerf inference v4.0
GATEOverflow · May 21, 2024 · e6e1d9a
2 parents 03eaf74 + 31f31c3
commit e6e1d9a
Show file tree

Hide file tree

Showing 7 changed files with 37 additions and 11 deletions.
diff --git a/automation/script/module.py b/automation/script/module.py
@@ -975,7 +975,7 @@ def _run(self, i):
                 if str(state['docker'].get('run', True)).lower() in ['false', '0', 'no']:
                     print (recursion_spaces+'  - Skipping script::{} run as we are inside docker'.format(found_script_artifact))
                     return {'return': 0}
-                elif str(state['docker'].get('docker_real_run', True)).lower() in ['false', '0', 'no']:
+                elif str(state['docker'].get('real_run', True)).lower() in ['false', '0', 'no']:
                     print (recursion_spaces+'  - Doing fake run for script::{} as we are inside docker'.format(found_script_artifact))
                     fake_run = True
                     env['CM_TMP_FAKE_RUN']='yes'

diff --git a/script/add-custom-nvidia-system/_cm.yaml b/script/add-custom-nvidia-system/_cm.yaml
@@ -6,7 +6,8 @@ automation_alias: script
 automation_uid: 5b4e0237da074764
 
 category: "MLPerf benchmark support"
-
+docker:
+  real_run: False
 
 # User-friendly tags to find this CM script
 tags:
@@ -94,6 +95,13 @@ variations:
     add_deps_recursive:
       nvidia-inference-common-code:
         tags: _ctuning
+  go:
+    group: code
+    add_deps_recursive:
+      nvidia-inference-common-code:
+        tags: _go
+
+
 
 
 versions:
@@ -111,3 +119,8 @@ versions:
     add_deps_recursive:
       nvidia-inference-common-code:
         version: r3.1
+
+  r4.0:
+    add_deps_recursive:
+      nvidia-inference-common-code:
+        version: r4.0
diff --git a/script/app-mlperf-inference-nvidia/customize.py b/script/app-mlperf-inference-nvidia/customize.py
@@ -10,6 +10,9 @@ def preprocess(i):
         return {'return':1, 'error': 'Windows is not supported in this script yet'}
     env = i['env']
 
+    if str(env.get('CM_RUN_STATE_DOCKER', '')).lower() in ['1', 'true', 'yes']:
+        return {'return': 0}
+
     if env.get('CM_MODEL', '') == '':
         return {'return': 1, 'error': 'Please select a variation specifying the model to run'}
 

diff --git a/script/app-mlperf-inference/_cm.yaml b/script/app-mlperf-inference/_cm.yaml
@@ -280,12 +280,6 @@ variations:
       CM_IMAGENET_ACCURACY_DTYPE: int32
       CM_CNNDM_ACCURACY_DTYPE: int32
       CM_LIBRISPEECH_ACCURACY_DTYPE: int8
-    deps:
-      - tags: get,cuda-devices
-        skip_if_env:
-          CM_CUDA_DEVICE_PROP_GLOBAL_MEMORY:
-            - "yes"
-            - "on"
     prehook_deps:
       - names:
          - nvidia-original-mlperf-inference
@@ -911,6 +905,12 @@ variations:
     add_deps_recursive:
       mlperf-inference-implementation:
         tags: _cuda
+    deps:
+      - tags: get,cuda-devices
+        skip_if_env:
+          CM_CUDA_DEVICE_PROP_GLOBAL_MEMORY:
+            - "yes"
+            - "on"
   rocm:
     docker:
       all_gpus: 'yes'
@@ -1266,6 +1266,7 @@ docker:
     - tags: get,mlperf,inference,results,dir
     - tags: get,mlperf,inference,submission,dir
   pre_run_cmds:
+    #- cm pull repo && cm run script --tags=get,git,repo,_repo.https://github.com/GATEOverflow/inference_results_v4.0.git --update
     - cm pull repo
   mounts:
    - "${{ CM_DATASET_IMAGENET_PATH }}:${{ CM_DATASET_IMAGENET_PATH }}"

diff --git a/script/build-mlperf-inference-server-nvidia/_cm.yaml b/script/build-mlperf-inference-server-nvidia/_cm.yaml
@@ -201,7 +201,6 @@ variations:
         names:
         - pytorchvision
         - torchvision
-      - tags: install,nccl,libs,_cuda
 
 versions:
   r2.1:
@@ -250,7 +249,6 @@ versions:
         names:
         - pytorchvision
         - torchvision
-      - tags: install,nccl,libs,_cuda
 
 docker:
   skip_run_cmd: 'no'

diff --git a/script/get-git-repo/_cm.json b/script/get-git-repo/_cm.json
@@ -17,6 +17,7 @@
     "folder": "CM_GIT_CHECKOUT_FOLDER",
     "patch": "CM_GIT_PATCH",
     "update": "CM_GIT_REPO_PULL",
+    "pull": "CM_GIT_REPO_PULL",
     "env_key": "CM_GIT_ENV_KEY",
     "submodules": "CM_GIT_RECURSE_SUBMODULES"
   },

diff --git a/script/install-torchvision-from-src/_cm.json b/script/install-torchvision-from-src/_cm.json
@@ -95,7 +95,17 @@
       }
     },
     "for-nvidia-mlperf-inference-v4.0": {
-      "alias": "for-nvidia-mlperf-inference-v3.1"
+      "base": [
+        "sha.657027f3",
+        "cuda"
+      ],
+      "deps": [
+        {
+          "tags": "install,pytorch,from.src,_for-nvidia-mlperf-inference-v4.0"
+        }
+      ],
+      "env": {
+      }
     },
     "for-nvidia-mlperf-inference-v3.1": {
       "base": [