Merge from GO #49

Merged on May 20, 2024 (54 commits)

Commits
5a586e5
Fix conda deps for intel-mlperf-inference
arjunsuresh May 14, 2024
0da5f30
Fixes for llvm build intel-mlperf-inference-gptj
arjunsuresh May 14, 2024
0b899a5
Merge branch 'GATEOverflow:mlperf-inference' into mlperf-inference
arjunsuresh May 14, 2024
184d043
Improve dependency handling for llvm-install intel-mlperf-inference
arjunsuresh May 14, 2024
d936b13
Merge pull request #6 from arjunsuresh/mlperf-inference
arjunsuresh May 14, 2024
18cd4aa
Added preprocess_multihot file for criteo
arjunsuresh May 14, 2024
094db73
Fix docker inputs for app-mlperf-inference
arjunsuresh May 14, 2024
48bcf2f
testing
anandhu-eng May 15, 2024
6bbba66
testing
anandhu-eng May 15, 2024
38aeae7
Fix onnx-graphsurgeon version for mlperf-inference-nvidia
arjunsuresh May 15, 2024
c35fa2c
Merge branch 'GATEOverflow:mlperf-inference' into mlperf-inference
arjunsuresh May 15, 2024
786f32e
Merge pull request #7 from arjunsuresh/mlperf-inference
arjunsuresh May 15, 2024
15052fe
Fix some default values for mlperf-inference-intel-gptj
arjunsuresh May 15, 2024
09542ee
added cuda
anandhu-eng May 15, 2024
b48c4b8
Merge branch 'GATEOverflow:mlperf-inference' into mlperf-inference
anandhu-eng May 15, 2024
d30c084
code clean
anandhu-eng May 15, 2024
9e43557
cleaned temp files
anandhu-eng May 15, 2024
7c347ad
Merge pull request #8 from anandhu-eng/mlperf-inference
arjunsuresh May 15, 2024
8b44a9b
Merge branch 'GATEOverflow:mlperf-inference' into mlperf-inference
arjunsuresh May 15, 2024
c1bef7f
Update README-specs.md
arjunsuresh May 16, 2024
242c61f
Update README-specs.md
arjunsuresh May 16, 2024
a280e4c
Update README-specs.md
arjunsuresh May 16, 2024
bad8bdc
Update README-specs.md
arjunsuresh May 16, 2024
b203d4f
Update README-specs.md
arjunsuresh May 16, 2024
10bab4c
Update README-specs.md
arjunsuresh May 16, 2024
06438c6
Merge branch 'GATEOverflow:mlperf-inference' into mlperf-inference
arjunsuresh May 17, 2024
5cadda9
Merge pull request #9 from arjunsuresh/mlperf-inference
arjunsuresh May 17, 2024
6c934c3
Make accuracy+performance the default for mlperf-inference run
arjunsuresh May 18, 2024
65b03e8
Fixes for app-mlperf-inference
arjunsuresh May 14, 2024
05c922e
Merge branch 'GATEOverflow:mlperf-inference' into mlperf-inference
arjunsuresh May 18, 2024
ec4ba20
Merge pull request #10 from arjunsuresh/mlperf-inference
arjunsuresh May 18, 2024
8930565
GPTJ network tag added
anandhu-eng May 18, 2024
65c42db
Fixes #11- non-update of mlperf-inference-sut-configs and descriptions
arjunsuresh May 18, 2024
01fb07f
Dont pass HW_NAME to the docker for mlperf-inference
arjunsuresh May 18, 2024
2683058
Fixes #12, version fix for mlperf-inference-src and mlperf-power-dev
arjunsuresh May 18, 2024
5386372
Fix redundant git hash update
arjunsuresh May 18, 2024
88370a0
Merge pull request #13 from arjunsuresh/mlperf-inference
arjunsuresh May 18, 2024
a9cfe4c
Fix redundant git hash update
arjunsuresh May 18, 2024
e98e0b7
Fix redundant git hash update
arjunsuresh May 18, 2024
41aab20
Merge branch 'GATEOverflow:mlperf-inference' into mlperf-inference
arjunsuresh May 18, 2024
8556c62
Merge pull request #14 from arjunsuresh/mlperf-inference
arjunsuresh May 18, 2024
8d0d5ee
Merge branch 'GATEOverflow:mlperf-inference' into mlperf-inference
anandhu-eng May 19, 2024
ed2697c
Merge pull request #15 from anandhu-eng/mlperf-inference
arjunsuresh May 19, 2024
d23f0ac
Fix the accuract dtype for gptj reference
arjunsuresh May 19, 2024
db07e70
Merge pull request #16 from arjunsuresh/mlperf-inference
arjunsuresh May 19, 2024
52dc4d0
Support mlperf-inference 4.0 reproducibility
arjunsuresh May 20, 2024
7430a1a
Merge branch 'GATEOverflow:mlperf-inference' into mlperf-inference
arjunsuresh May 20, 2024
23b5109
Removed debug statements
arjunsuresh May 20, 2024
e412b59
Merge pull request #17 from arjunsuresh/mlperf-inference
arjunsuresh May 20, 2024
700a610
Added support for nvmitten for mlperf inference 4.0
arjunsuresh May 20, 2024
c318f4c
Added support for nvmitten for mlperf inference 4.0
arjunsuresh May 20, 2024
e29850a
Merge branch 'GATEOverflow:mlperf-inference' into mlperf-inference
arjunsuresh May 20, 2024
7001167
Support mlperf-inference 4.0 reproducibility
arjunsuresh May 20, 2024
0a6fa81
Merge pull request #18 from arjunsuresh/mlperf-inference
arjunsuresh May 20, 2024
8 changes: 3 additions & 5 deletions automation/script/README-specs.md
@@ -27,7 +27,7 @@ See the [automatically generated catalog](https://github.com/mlcommons/ck/blob/m
When we run a CM script we can also pass inputs to it and any input added in `input_mapping` dictionary inside `_cm.json` gets converted to the corresponding `ENV` variable.
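For illustration only, a minimal sketch of what such a mapping could look like in a script's `_cm.json` (the input names and env variables below are hypothetical, borrowed from elsewhere in this PR):

```json
{
  "input_mapping": {
    "model": "CM_ML_MODEL_FILE_WITH_PATH",
    "batch_size": "CM_MLPERF_LOADGEN_BATCH_SIZE"
  }
}
```

With a mapping like this, passing `--batch_size=8` to `cm run script` would surface as `CM_MLPERF_LOADGEN_BATCH_SIZE=8` in the script's environment.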

### Conditional execution of any `deps`, `post_deps`
We can use `skip_if_env` dictionary inside any `deps`, `prehook_deps`, `posthook_deps` or `post_deps` to make its executional conditional
We can use `skip_if_env` dictionary inside any `deps`, `prehook_deps`, `posthook_deps` or `post_deps` to make its execution conditional
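As a rough sketch (the dependency tags and env values here are illustrative, reusing names that appear elsewhere in this PR), a conditional dependency could look like:

```yaml
deps:
  - tags: get,cuda,_cudnn
    names:
      - cuda
    skip_if_env:
      CM_MLPERF_DEVICE:
        - cpu
```

Here the CUDA dependency would be skipped whenever `CM_MLPERF_DEVICE` is `cpu`; `enable_if_env` (used in several diffs below) has the same structure with the opposite polarity.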

### Versions
We can specify any specific version of a script using `version`. `version_max` and `version_min` are also possible options.
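A small illustrative sketch (the exact versions are taken from other diffs in this PR; the min/max bounds are made up for the example):

```yaml
deps:
  - tags: get,generic-python-lib,_package.onnx
    version: 1.13.1          # pin an exact version
  - tags: get,generic-python-lib,_onnx-graphsurgeon
    version_min: '0.3.26'    # hypothetical lower bound
    version_max: '0.3.27'    # hypothetical upper bound
```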
@@ -73,9 +73,7 @@ Sometimes it is difficult to add all variations needed for a script like say `ba

### Script workflow (env, deps, native scripts)

![](assets/scripts-workflow.png)
<img src="https://github.com/mlcommons/cm4mlops/raw/mlperf-inference/automation/script/assets/scripts-workflow.png" width="248">




&copy; 2022-23 [MLCommons](https://mlcommons.org)<br>
&copy; 2022-24 [MLCommons](https://mlcommons.org)<br>
9 changes: 7 additions & 2 deletions automation/script/module.py
@@ -4314,19 +4314,24 @@ def enable_or_skip_script(meta, env):
(AND function)
"""
for key in meta:
meta_key = [str(v).lower() for v in meta[key]]
if key in env:
value = str(env[key]).lower()

meta_key = [str(v).lower() for v in meta[key]]

if set(meta_key) & set(["yes", "on", "true", "1"]):
# Any set value other than false is taken as set
if value not in ["no", "off", "false", "0"]:
continue
elif set(meta_key) & set(["no", "off", "false", "0"]):
if value in ["no", "off", "false", "0"]:
continue
elif value in meta_key:
continue
else:
if set(meta_key) & set(["no", "off", "false", "0"]):
# If key is missing in env, and if the expected value is False, consider it a match
continue

return False

return True
2 changes: 1 addition & 1 deletion automation/script/module_misc.py
@@ -1873,7 +1873,7 @@ def docker(i):
dockerfilename_suffix = dockerfilename_suffix[len(dockerfilename_suffix) - 1]


cm_repo=i.get('docker_cm_repo', 'mlcommons@cm4mlops')
cm_repo=i.get('docker_cm_repo', docker_settings.get('cm_repo', 'mlcommons@cm4mlops'))

docker_path = i.get('docker_path', '').strip()
if docker_path == '':
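The practical effect of this one-line change, as far as it can be read from the diff, is that a script's own `docker` metadata can now choose which CM repository is pulled inside the container instead of always defaulting to `mlcommons@cm4mlops`; for example, the `app-mlperf-inference` metadata later in this PR sets:

```yaml
docker:
  cm_repo: gateoverflow@cm4mlops
```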
2 changes: 1 addition & 1 deletion script/app-mlperf-inference-intel/_cm.yaml
@@ -293,7 +293,7 @@ variations:
adr:
conda-python:
version: "3.9"
- tags: install,llvm,src,_tag.llvmorg-16.0.6,_clang,_release,_for-intel-mlperf-inference-v3.1-gptj
- tags: install,llvm,src,_for-intel-mlperf-inference-v3.1-gptj
- names:
- conda-package
- ncurses
10 changes: 6 additions & 4 deletions script/app-mlperf-inference-intel/run_gptj_harness.sh
@@ -1,6 +1,8 @@
#!/bin/bash
export PATH=${CM_CONDA_BIN_PATH}:$PATH

KMP_BLOCKTIME=${KMP_BLOCKTIME:-10}

export KMP_BLOCKTIME=${KMP_BLOCKTIME}
export KMP_AFFINITY=granularity=fine,compact,1,0
export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libiomp5.so
@@ -9,11 +11,11 @@ export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
export num_physical_cores=`lscpu -b -p=Core,Socket | grep -v '^#' | sort -u | wc -l`
num_numa=$(numactl --hardware|grep available|awk -F' ' '{ print $2 }')

NUM_PROC=${NUM_PROC:-num_numa}
NUM_PROC=${NUM_PROC:-$num_numa}
CPUS_PER_PROC=$((num_physical_cores/num_numa))
WORKERS_PER_PROC=${WORKERS_PER_PROC}
WORKERS_PER_PROC=${WORKERS_PER_PROC:-1}
TOTAL_SAMPLE_COUNT=13368
BATCH_SIZE=${CM_MLPERF_LOADGEN_BATCH_SIZE}
BATCH_SIZE=${CM_MLPERF_LOADGEN_BATCH_SIZE:-8}
TIMESTAMP=$(date +%m-%d-%H-%M)
HOSTNAME=$(hostname)
#OUTPUT_DIR=offline-output-${HOSTNAME}-batch-${BATCH_SIZE}-procs-${NUM_PROC}-ins-per-proc-${WORKERS_PER_PROC}-${TIMESTAMP}
@@ -28,7 +30,7 @@ USER_CONF="${CM_MLPERF_USER_CONF}"


cmd="python runner.py --workload-name gptj \
--scenario ${${CM_MLPERF_LOADGEN_SCENARIO}} \
--scenario ${CM_MLPERF_LOADGEN_SCENARIO} \
--mode ${LOADGEN_MODE} \
--num-proc ${NUM_PROC} \
--cpus-per-proc ${CPUS_PER_PROC} \
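A side note on the shell fixes above: `${VAR:-default}` substitutes the default only when `VAR` is unset or empty, and the added `$` before `num_numa` is what makes the default expand to the variable's value rather than the literal text. A minimal sketch of the difference:

```bash
#!/bin/bash
num_numa=2
unset NUM_PROC
echo "${NUM_PROC:-num_numa}"     # old form: prints the literal string "num_numa"
echo "${NUM_PROC:-$num_numa}"    # fixed form: prints 2
```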
2 changes: 2 additions & 0 deletions script/app-mlperf-inference-mlcommons-python/_cm.yaml
@@ -104,6 +104,8 @@ deps:

# Detect CUDA if required
- tags: get,cuda,_cudnn
names:
- cuda
enable_if_env:
CM_MLPERF_DEVICE:
- gpu
3 changes: 1 addition & 2 deletions script/app-mlperf-inference-mlcommons-python/customize.py
@@ -177,7 +177,7 @@ def get_run_cmd_reference(os_info, env, scenario_extra_options, mode_extra_optio

env['RUN_DIR'] = os.path.join(env['CM_MLPERF_INFERENCE_SOURCE'], "language", "gpt-j")
cmd = env['CM_PYTHON_BIN_WITH_PATH'] + \
" main.py --model-path=" + env['CM_ML_MODEL_FILE_WITH_PATH'] + ' --dataset-path=' + env['CM_DATASET_EVAL_PATH'] + " --scenario " + env['CM_MLPERF_LOADGEN_SCENARIO'] + " " + env['CM_MLPERF_LOADGEN_EXTRA_OPTIONS'] + \
" run.py --model-path=" + env['CM_ML_MODEL_FILE_WITH_PATH'] + ' --dataset-path=' + env['CM_DATASET_EVAL_PATH'] + " --scenario " + env['CM_MLPERF_LOADGEN_SCENARIO'] + " " + env['CM_MLPERF_LOADGEN_EXTRA_OPTIONS'] + \
' --dtype ' + env['CM_MLPERF_MODEL_PRECISION'] + \
scenario_extra_options + mode_extra_options + dataset_options
cmd = cmd.replace("--count", "--max_examples")
@@ -188,7 +188,6 @@ def get_run_cmd_reference(os_info, env, scenario_extra_options, mode_extra_optio
gpu_options = ""
cmd = cmd + gpu_options
env['LOG_PATH'] = env['CM_MLPERF_OUTPUT_DIR']
return cmd, env['RUN_DIR']

if env['CM_MODEL'] in [ "resnet50", "retinanet" ]:

6 changes: 5 additions & 1 deletion script/app-mlperf-inference-nvidia/_cm.yaml
@@ -261,7 +261,10 @@ deps:
CM_MLPERF_NVIDIA_HARNESS_RUN_MODE:
- run_harness

- tags: get,generic-python-lib,_package.nvmitten,_path./opt/nvmitten-0.1.3-cp38-cp38-linux_x86_64.whl
- tags: get,generic-python-lib,_package.nvmitten
update_tags_from_env_with_prefix:
_path.:
- CM_ENV_NVMITTEN_DOCKER_WHEEL_PATH
enable_if_env:
CM_RUN_STATE_DOCKER:
- 'yes'
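Read together with the `app-mlperf-inference` changes later in this PR, the `_path.` suffix appears to be filled in from `CM_ENV_NVMITTEN_DOCKER_WHEEL_PATH`, so the nvmitten wheel location can differ per reproducibility variation, e.g. (copied from the r4.0_default diff below):

```yaml
env:
  CM_ENV_NVMITTEN_DOCKER_WHEEL_PATH: '/opt/nvmitten-0.1.3-cp38-cp38-linux_x86_64.whl'
```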
@@ -338,6 +341,7 @@ variations:
CM_ML_MODEL_WEIGHTS_DATA_TYPE: int8
deps:
- tags: get,generic-python-lib,_onnx-graphsurgeon
version: 0.3.27
- tags: get,generic-python-lib,_package.onnx
version: 1.13.1

36 changes: 31 additions & 5 deletions script/app-mlperf-inference/_cm.yaml
@@ -182,13 +182,16 @@ variations:
tags: _float32
librispeech-accuracy-script:
tags: _int32
cnndm-accuracy-script:
tags: _int32
env:
CM_MLPERF_PYTHON: 'yes'
CM_MLPERF_IMPLEMENTATION: mlcommons_python
CM_SQUAD_ACCURACY_DTYPE: float32
CM_IMAGENET_ACCURACY_DTYPE: float32
CM_OPENIMAGES_ACCURACY_DTYPE: float32
CM_LIBRISPEECH_ACCURACY_DTYPE: float32
CM_CNNDM_ACCURACY_DTYPE: int32
prehook_deps:
- names:
- python-reference-mlperf-inference
@@ -235,6 +238,10 @@ variations:
default_variations:
backend: onnxruntime

nvidia-original,r4.1_default:
docker:
base_image: nvcr.io/nvidia/mlperf/mlperf-inference:mlpinf-v4.0-cuda12.2-cudnn8.9-x86_64-ubuntu20.04-public

nvidia-original:
docker:
interactive: True
@@ -430,7 +437,7 @@ variations:
tags: run,accuracy,mlperf,_imagenet
docker:
deps:
- tags: get,dataset,imagenet,original
- tags: get,dataset,imagenet,validation,original
names:
- imagenet-original
- dataset-original
@@ -1142,6 +1149,25 @@ variations:
default_env:
CM_SKIP_SYS_UTILS: 'yes'
CM_REGENERATE_MEASURE_FILES: 'yes'
env:
CM_ENV_NVMITTEN_DOCKER_WHEEL_PATH: '/opt/nvmitten-0.1.3-cp38-cp38-linux_x86_64.whl'

r4.1_default:
group:
reproducibility
add_deps_recursive:
nvidia-inference-common-code:
version: r4.0
tags: _go
nvidia-inference-server:
version: r4.0
tags: _go
default_env:
CM_SKIP_SYS_UTILS: 'yes'
CM_REGENERATE_MEASURE_FILES: 'yes'
env:
CM_ENV_NVMITTEN_DOCKER_WHEEL_PATH: '/opt/nvmitten-0.1.3b0-cp38-cp38-linux_x86_64.whl'


invalid_variation_combinations:
-
@@ -1250,10 +1276,10 @@ docker:
shm_size: '32gb'
interactive: True
extra_run_args: ' --ulimit memlock=-1 --cap-add SYS_ADMIN --cap-add SYS_TIME --security-opt apparmor=unconfined --security-opt seccomp=unconfined'
docker_os: ubuntu
docker_cm_repo: gateoverflow@cm4mlops
docker_real_run: False
docker_os_version: '22.04'
os: ubuntu
cm_repo: gateoverflow@cm4mlops
real_run: False
os_version: '22.04'
docker_input_mapping:
imagenet_path: IMAGENET_PATH
gptj_checkpoint_path: GPTJ_CHECKPOINT_PATH
66 changes: 9 additions & 57 deletions script/app-mlperf-inference/customize.py
@@ -46,8 +46,8 @@ def postprocess(i):
env['CMD'] = ''
state = i['state']

if env.get('CM_MLPERF_USER_CONF', '') == '':
return {'return': 0}
#if env.get('CM_MLPERF_USER_CONF', '') == '':
# return {'return': 0}

output_dir = env['CM_MLPERF_OUTPUT_DIR']
mode = env['CM_MLPERF_LOADGEN_MODE']
@@ -254,16 +254,16 @@ def postprocess(i):
if env.get('CM_HOST_SYSTEM_NAME','')!='': host_info['system_name']=env['CM_HOST_SYSTEM_NAME']

# Check CM automation repository
repo_name = 'mlcommons@ck'
repo_name = 'mlcommons@cm4mlops'
repo_hash = ''
r = cm.access({'action':'find', 'automation':'repo', 'artifact':'mlcommons@ck,a4705959af8e447a'})
r = cm.access({'action':'find', 'automation':'repo', 'artifact':'mlcommons@cm4mlops,9e97bb72b0474657'})
if r['return']==0 and len(r['list'])==1:
repo_path = r['list'][0].path
if os.path.isdir(repo_path):
repo_name = os.path.basename(repo_path)

# Check Grigori's dev
if repo_name == 'ck': repo_name = 'ctuning@mlcommons-ck'
# Check dev
if repo_name == 'cm4mlops': repo_name = 'gateoverflow@cm4mlops'

r = cm.access({'action':'system',
'automation':'utils',
@@ -275,54 +275,6 @@ def postprocess(i):
host_info['cm_repo_name'] = repo_name
host_info['cm_repo_git_hash'] = repo_hash

# Check a few important MLCommons repos
xhashes = []
md_xhashes = ''

for x in [('get,git,inference', ['inference']),
('get,git,mlperf,power', ['power-dev'])]:
xtags = x[0]
xdirs = x[1]

rx = cm.access({'action':'find', 'automation':'cache', 'tags':xtags})
if rx['return']>0: return rx
for cache in rx['list']:
xurl = ''
xhash = ''

for xd in xdirs:
xpath = os.path.join(cache.path, xd)
if os.path.isdir(xpath):
r = cm.access({'action':'system', 'automation':'utils', 'path':xpath, 'cmd':'git rev-parse HEAD'})
if r['return'] == 0 and r['ret'] == 0:
xhash = r['stdout']

r = cm.access({'action':'system', 'automation':'utils', 'path':xpath, 'cmd':'git config --get remote.origin.url'})
if r['return'] == 0 and r['ret'] == 0:
xurl = r['stdout']

if xurl!='' and xhash!='':
break

if xurl!='' and xhash!='':
# Check if doesn't exist
found = False

for xh in xhashes:
if xh['mlcommons_git_url'] == xurl and xh['mlcommons_git_hash'] == xhash:
found = True
break

if not found:
xhashes.append({'mlcommons_git_url': xurl,
'mlcommons_git_hash': xhash,
'cm_cache_tags':cache.meta['tags']})

md_xhashes +='* MLCommons Git {} ({})\n'.format(xurl, xhash)

if len(xhashes)>0:
host_info['mlcommons_repos'] = xhashes

with open ("cm-host-info.json", "w") as fp:
fp.write(json.dumps(host_info, indent=2)+'\n')

@@ -336,10 +288,10 @@ def postprocess(i):

readme_init = "This experiment is generated using the [MLCommons Collective Mind automation framework (CM)](https://github.com/mlcommons/ck).\n\n"

readme_init+= "*Check [CM MLPerf docs](https://github.com/mlcommons/ck/tree/master/docs/mlperf) for more details.*\n\n"
readme_init+= "*Check [CM MLPerf docs](https://mlcommons.github.io/inference) for more details.*\n\n"

readme_body = "## Host platform\n\n* OS version: {}\n* CPU version: {}\n* Python version: {}\n* MLCommons CM version: {}\n{}\n\n".format(platform.platform(),
platform.processor(), sys.version, cm.__version__, md_xhashes)
readme_body = "## Host platform\n\n* OS version: {}\n* CPU version: {}\n* Python version: {}\n* MLCommons CM version: {}\n\n".format(platform.platform(),
platform.processor(), sys.version, cm.__version__)

x = repo_name
if repo_hash!='': x+=' --checkout='+str(repo_hash)
45 changes: 42 additions & 3 deletions script/build-mlperf-inference-server-nvidia/_cm.yaml
@@ -164,6 +164,11 @@ variations:
add_deps_recursive:
nvidia-inference-common-code:
tags: _ctuning
go:
group: code
add_deps_recursive:
nvidia-inference-common-code:
tags: _go
nvidia-only:
group: code
add_deps_recursive:
@@ -180,6 +185,23 @@ variations:
nvidia-inference-common-code:
tags: _mlcommons

r4.0:
group: version
add_deps_recursive:
nvidia-inference-common-code:
version: r4.0
nvidia-scratch-space:
tags: _version.4_1
deps:
- tags: install,pytorch,from.src,_for-nvidia-mlperf-inference-v4.0
names:
- pytorch
- torch
- tags: install,torchvision,from.src,_for-nvidia-mlperf-inference-v4.0
names:
- pytorchvision
- torchvision
- tags: install,nccl,libs,_cuda

versions:
r2.1:
@@ -213,16 +235,33 @@ versions:
- torchvision
- tags: install,nccl,libs,_cuda

r4.0:
add_deps_recursive:
nvidia-inference-common-code:
version: r4.0
nvidia-scratch-space:
tags: _version.4_1
deps:
- tags: install,pytorch,from.src,_for-nvidia-mlperf-inference-v4.0
names:
- pytorch
- torch
- tags: install,torchvision,from.src,_for-nvidia-mlperf-inference-v4.0
names:
- pytorchvision
- torchvision
- tags: install,nccl,libs,_cuda

docker:
skip_run_cmd: 'no'
all_gpus: 'yes'
shm_size: '32gb'
extra_run_args: ' --ulimit memlock=-1 --cap-add SYS_ADMIN --cap-add SYS_TIME --security-opt apparmor=unconfined --security-opt seccomp=unconfined'
docker_os: ubuntu
os: ubuntu
cm_repo_flags1: ' --branch=mlperf-inference'
docker_real_run: False
real_run: False
interactive: True
docker_os_version: '20.04'
os_version: '20.04'
base_image: nvcr.io/nvidia/mlperf/mlperf-inference:mlpinf-v3.1-cuda12.2-cudnn8.9-x86_64-ubuntu20.04-l4-public
docker_input_mapping:
imagenet_path: IMAGENET_PATH