Skip to content

Commit

Permalink
Merge pull request #43 from mlcommons/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
ctuning-admin authored May 4, 2024
2 parents 1e6f335 + 873c577 commit f4c6fb5
Show file tree
Hide file tree
Showing 41 changed files with 290 additions and 431 deletions.
5 changes: 5 additions & 0 deletions automation/script/module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1889,6 +1889,8 @@ def docker(i):

all_gpus = i.get('docker_all_gpus', docker_settings.get('all_gpus'))

num_gpus = i.get('docker_num_gpus', docker_settings.get('num_gpus'))

device = i.get('docker_device', docker_settings.get('device'))

r = check_gh_token(i, docker_settings, quiet)
Expand Down Expand Up @@ -1983,6 +1985,9 @@ def docker(i):
if all_gpus:
cm_docker_input['all_gpus'] = True

if num_gpus:
cm_docker_input['num_gpus'] = str(num_gpus)

if device:
cm_docker_input['device'] = device

Expand Down
7 changes: 6 additions & 1 deletion automation/utils/module_cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,14 +282,19 @@ def select_cfg(i):
meta = r['meta']
ss['meta'] = meta

selector = sorted(selector, key = lambda x: x['meta'].get('name',''))
s = 0
for ss in selector:
alias = ss['alias']
name = meta.get('name','')
name = ss['meta'].get('name','')

x = name
if x!='': x+=' '
x += '('+alias+')'

print ('{}) {}'.format(s, x))

s+=1

print ('')
select = input ('Enter configuration number of press Enter for 0: ')
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
uid: 854e65fb31584d63

name: "Nvidia Ubuntu 20.04 CUDA 11.8 cuDNN 8.6.0 PyTorch 1.13.0"

input:
docker_base_image: 'nvcr.io/nvidia/pytorch:22.10-py3'
docker_os: ubuntu
docker_os_version: '20.04'

Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
uid: e0e7167139a74e36

name: "Nvidia Ubuntu 22.04 CUDA 12.1 cuDNN 8.9.1 PyTorch 2.0.0"

input:
docker_base_image: 'nvcr.io/nvidia/pytorch:23.05-py3'
docker_os: ubuntu
docker_os_version: '22.04'

Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
uid: 49fc51f2999b4545

name: "Nvidia Ubuntu 22.04 CUDA 12.4 cuDNN 9.0.0 PyTorch 2.3.0"

input:
docker_base_image: 'nvcr.io/nvidia/pytorch:24.03-py3'
docker_os: ubuntu
docker_os_version: '22.04'

2 changes: 0 additions & 2 deletions script/app-image-classification-onnx-py/_cm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -117,5 +117,3 @@ docker:
- env.CM_IMAGE
- output
- j
pre_run_cmds:
- echo \"CM pre run commands\"
2 changes: 1 addition & 1 deletion script/app-loadgen-generic-python/README-extra.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ and pull CM repository with portable automation scripts to benchmark ML Systems:

```bash
pip install cmind
cm pull repo mlcommons@ck
cm pull repo mlcommons@cm4mlops --checkout=dev
```

### Clean CM cache
Expand Down
4 changes: 2 additions & 2 deletions script/app-mlperf-inference-nvidia/README-about.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,13 @@ Assuming all the downloaded files are to the user home directory please do the f
--cudnn_tar_file_path=$HOME/cudnn-linux-x86_64-8.9.2.26_cuda11-archive.tar.xz \
--imagenet_path=$HOME/imagenet-2012-val \
--scratch_path=$HOME/mlperf_scratch \
--docker_cm_repo=mlcommons@ck \
--docker_cm_repo=mlcommons@cm4mlops \
--results_dir=$HOME/results_dir \
--submission_dir=$HOME/submission_dir \
--adr.compiler.tags=gcc
```
* Use `--docker_cache=no` to turn off docker caching
* Use `--docker_run_cmd_prefix="cm pull repo mlcommons@ck"` to update the CK repository when docker caching is used
* Use `--docker_run_cmd_prefix="cm pull repo mlcommons@cm4mlops --checkout=dev"` to update the CM repository when docker caching is used
* Use `--custom_system=no` if you are using a similar system to the [Nvidia submission systems for MLPerf inference 3.0](https://github.com/mlcommons/inference_results_v3.0/tree/main/closed/NVIDIA/systems).

6. At the end of the build you'll get the following prompt unless you have chosen `--custom_system=no`. Please give a system name and say yes to generating the configuration files
Expand Down
4 changes: 2 additions & 2 deletions script/app-mlperf-inference-nvidia/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,13 @@ Assuming all the downloaded files are to the user home directory please do the f
--cudnn_tar_file_path=$HOME/cudnn-linux-x86_64-8.9.2.26_cuda11-archive.tar.xz \
--imagenet_path=$HOME/imagenet-2012-val \
--scratch_path=$HOME/mlperf_scratch \
--docker_cm_repo=mlcommons@ck \
--docker_cm_repo=mlcommons@cm4mlops \
--results_dir=$HOME/results_dir \
--submission_dir=$HOME/submission_dir \
--adr.compiler.tags=gcc
```
* Use `--docker_cache=no` to turn off docker caching
* Use `--docker_run_cmd_prefix="cm pull repo mlcommons@ck"` to update the CK repository when docker caching is used
* Use `--docker_run_cmd_prefix="cm pull repo mlcommons@cm4mlops --checkout=dev"` to update the CM repository when docker caching is used
* Use `--custom_system=no` if you are using a similar system to the [Nvidia submission systems for MLPerf inference 3.0](https://github.com/mlcommons/inference_results_v3.0/tree/main/closed/NVIDIA/systems).

6. At the end of the build you'll get the following prompt unless you have chosen `--custom_system=no`. Please give a system name and say yes to generating the configuration files
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ image from the Community AMIs is the recommended OS image as it comes with the Q
```
sudo yum install -y python38-devel git
python3.8 -m pip install cmind
cm pull repo mlcommons@ck
cm pull repo mlcommons@cm4mlops
cm run script --tags=get,python --version_min=3.8.1
```

Expand Down
8 changes: 4 additions & 4 deletions script/app-mlperf-inference/README-extra.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,16 @@ source .profile
Next you need to install a CM repository with [cross-platform CM scripts](https://github.com/mlcommons/cm4mlops/tree/main/script) for ML Systems:

```bash
cm pull repo mlcommons@ck
cm pull repo mlcommons@cm4mlops --checkout=dev
```

Note that you can fork [this repository](https://github.com/mlcommons/ck) and use it instead of mlcommons@ck
Note that you can fork [this repository](https://github.com/mlcommons/cm4mlops) and use it instead of mlcommons@cm4mlops
to add CM scripts for your own public and private ML models, data sets, software and hardware.
In such case, just change mlcommons@ck to your own fork in the above command.
In such case, just change mlcommons@cm4mlops to your own fork in the above command.

You can find the location of this repository on your system as follows:
```bash
cm find repo mlcommons@ck
cm find repo mlcommons@cm4mlops
```

Now we suggest you to set up a virtual python via CM to avoid mixing up your native Python installation:
Expand Down
4 changes: 4 additions & 0 deletions script/app-mlperf-inference/_cm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,10 @@ variations:
deps:
- tags: get,mlperf,inference,nvidia,scratch,space
- tags: get,nvidia-docker
skip_if_env:
CM_SKIP_GET_NVIDIA_DOCKER:
- yes

mounts:
- "${{ CM_CUDNN_TAR_FILE_PATH }}:${{ CM_CUDNN_TAR_FILE_PATH }}"
- "${{ CM_TENSORRT_TAR_FILE_PATH }}:${{ CM_TENSORRT_TAR_FILE_PATH }}"
Expand Down
48 changes: 0 additions & 48 deletions script/build-docker-image/_cm.json

This file was deleted.

69 changes: 0 additions & 69 deletions script/build-dockerfile/_cm.json

This file was deleted.

2 changes: 1 addition & 1 deletion script/build-dockerfile/customize.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def preprocess(i):
continue
arg=env_
if env_ in default_env: #other inputs to be done later
arg=arg+"="+default_env[env_]
arg=arg+"="+str(default_env[env_])
#build_args.append(arg)
#input_args.append("--"+input_+"="+"$"+env_)

Expand Down
4 changes: 4 additions & 0 deletions script/build-mlperf-inference-server-nvidia/_cm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,10 @@ docker:
- tags: get,mlperf,inference,results,dir
- tags: get,mlperf,inference,submission,dir
- tags: get,nvidia-docker
skip_if_env:
CM_SKIP_GET_NVIDIA_DOCKER:
- yes

pre_run_cmds:
- cm pull repo
run_cmd_prefix: sudo apt remove -y cmake
Expand Down
2 changes: 1 addition & 1 deletion script/generate-mlperf-tiny-report/README-extra.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Install [MLCommons CM framework](https://github.com/mlcommons/ck/blob/master/doc

Pull the MLCommons CK repository with automation recipes for interoperable MLOps:
```bash
cm pull repo mlcommons@ck
cm pull repo mlcommons@cm4mlops --checkout=dev
```

Install repositories with raw MLPerf inference benchmark results:
Expand Down
36 changes: 0 additions & 36 deletions script/get-cuda-devices/_cm.json

This file was deleted.

Loading

0 comments on commit f4c6fb5

Please sign in to comment.