cleaning up docs and improving MLPerf inference benchmarks (#1079)

mlcommons · Jan 30, 2024 · 582190e · 582190e
2 parents 808ed7a + bf17659
commit 582190e
Show file tree

Hide file tree

Showing 15 changed files with 51 additions and 175 deletions.
diff --git a/README.md b/README.md
@@ -77,6 +77,9 @@ cmr "get generic-python-lib _package.torchvision" --version=0.16.2
 cmr "python app image-classification torch" --input=computer_mouse.jpg
 
 
+cm rm repo mlcommons@ck
+cm pull repo --url=https://zenodo.org/records/10581696/files/cm-mlops-repo-20240129.zip
+
 cmr "install llvm prebuilt" --version=17.0.6
 cmr "app image corner-detection"
 

diff --git a/cm-mlops/automation/script/module_misc.py b/cm-mlops/automation/script/module_misc.py
@@ -1391,15 +1391,21 @@ def dockerfile(i):
         run_cmd  = r['run_cmd_string']
 
 
-
+        docker_base_image = i.get('docker_base_image', docker_settings.get('base_image'))
         docker_os = i.get('docker_os', docker_settings.get('docker_os', 'ubuntu'))
         docker_os_version = i.get('docker_os_version', docker_settings.get('docker_os_version', '22.04'))
+        if not docker_base_image:
+            dockerfilename_suffix = docker_os +'_'+docker_os_version
+        else:
+            dockerfilename_suffix = docker_base_image.split("/")
+            dockerfilename_suffix = dockerfilename_suffix[len(dockerfilename_suffix) - 1]
+
         fake_run_deps = i.get('fake_run_deps', docker_settings.get('fake_run_deps', False))
         docker_run_final_cmds = docker_settings.get('docker_run_final_cmds', [])
 
         gh_token = i.get('docker_gh_token')
 
-        if i.get('docker_real_run', False):
+        if i.get('docker_real_run', docker_settings.get('docker_real_run',False)):
             fake_run_option = " "
             fake_run_deps = False
         else:
@@ -1409,7 +1415,7 @@ def dockerfile(i):
 
         env['CM_DOCKER_PRE_RUN_COMMANDS'] = docker_run_final_cmds
 
-        dockerfile_path = os.path.join(script_path,'dockerfiles', docker_os +'_'+docker_os_version +'.Dockerfile')
+        dockerfile_path = os.path.join(script_path,'dockerfiles', dockerfilename_suffix +'.Dockerfile')
         if i.get('print_deps'):
             cm_input = {'action': 'run',
                     'automation': 'script',
@@ -1434,6 +1440,7 @@ def dockerfile(i):
                             'automation': 'script',
                             'tags': 'build,dockerfile',
                             'cm_repo': cm_repo,
+                            'docker_base_image': docker_base_image,
                             'docker_os': docker_os,
                             'docker_os_version': docker_os_version,
                             'file_path': dockerfile_path,
@@ -1659,9 +1666,18 @@ def docker(i):
 
         mount_string = "" if len(mounts)==0 else ",".join(mounts)
 
+        docker_base_image = i.get('docker_base_image', docker_settings.get('base_image'))
+        docker_os = i.get('docker_os', docker_settings.get('docker_os', 'ubuntu'))
+        docker_os_version = i.get('docker_os_version', docker_settings.get('docker_os_version', '22.04'))
+        if not docker_base_image:
+            dockerfilename_suffix = docker_os +'_'+docker_os_version
+        else:
+            dockerfilename_suffix = docker_base_image.split("/")
+            dockerfilename_suffix = dockerfilename_suffix[len(dockerfilename_suffix) - 1]
+
         cm_repo=i.get('docker_cm_repo', 'mlcommons@ck')
 
-        dockerfile_path = os.path.join(script_path,'dockerfiles', _os +'_'+version +'.Dockerfile')
+        dockerfile_path = os.path.join(script_path,'dockerfiles', dockerfilename_suffix +'.Dockerfile')
 
         docker_skip_run_cmd = i.get('docker_skip_run_cmd', docker_settings.get('skip_run_cmd', False)) #skips docker run cmd and gives an interactive shell to the user
 
@@ -1712,15 +1728,16 @@ def docker(i):
                            'automation': 'script',
                            'tags': 'run,docker,container',
                            'recreate': 'yes',
+                           'docker_base_image': docker_base_image,
                            'docker_os': _os,
+                           'docker_os_version': version,
                            'cm_repo': cm_repo,
                            'env': env,
                            'image_repo': image_repo,
                            'interactive': interactive,
                            'mounts': mounts,
                            'image_name': 'cm-script-'+script_alias,
 #                            'image_tag': script_alias,
-                           'docker_os_version': version,
                            'detached': detached,
                            'script_tags': f'{tag_string}',
                            'run_cmd': run_cmd if docker_skip_run_cmd not in [ 'yes', True, 'True' ] else 'echo "cm version"',

diff --git a/cm-mlops/script/app-mlperf-inference-cpp/_cm.yaml b/cm-mlops/script/app-mlperf-inference-cpp/_cm.yaml
@@ -250,7 +250,7 @@ variations:
 
   offline,resnet50:
     default_variations:
-      batch-size: batch-size.8
+      batch-size: batch-size.32
 
   multistream,retinanet:
     default_variations:

diff --git a/cm-mlops/script/benchmark-any-mlperf-inference-implementation/_cm.yaml b/cm-mlops/script/benchmark-any-mlperf-inference-implementation/_cm.yaml
@@ -58,7 +58,7 @@ variations:
   nvidia:
     group: implementation
     env:
-      IMPLEMENTATION: nvidia
+      IMPLEMENTATION: nvidia-original
     default_env:
       MODELS: resnet50,retinanet,bert-99,bert-99.9,3d-unet-99,rnnt,gptj-99,gptj-99.9,dlrmv2-99,dlrmv2-99.9
       BACKENDS: tensorrt

diff --git a/cm-mlops/script/benchmark-any-mlperf-inference-implementation/customize.py b/cm-mlops/script/benchmark-any-mlperf-inference-implementation/customize.py
@@ -109,9 +109,9 @@ def preprocess(i):
                             test_query_count = 100
                     else:
                         if model == "resnet50":
-                            test_query_count = 10000
+                            test_query_count = 40000
                         else:
-                            test_query_count = 1000
+                            test_query_count = 2000
                     cmd = f'run_test "{model}" "{backend}" "{test_query_count}" "{implementation}" "{device}" "$find_performance_cmd"'
                     cmds.append(cmd)
                     #second argument is unused for submission_cmd

diff --git a/cm-mlops/script/build-dockerfile/_cm.json b/cm-mlops/script/build-dockerfile/_cm.json
@@ -10,6 +10,7 @@
     "cm_repo": "CM_MLOPS_REPO",
     "docker_os": "CM_DOCKER_OS",
     "docker_os_version": "CM_DOCKER_OS_VERSION",
+    "docker_base_image": "CM_DOCKER_IMAGE_BASE",
     "fake_run_option": "CM_DOCKER_FAKE_RUN_OPTION",
     "file_path": "CM_DOCKERFILE_WITH_PATH",
     "gh_token": "CM_GH_TOKEN",

diff --git a/cm-mlops/script/build-mlperf-inference-server-nvidia/_cm.yaml b/cm-mlops/script/build-mlperf-inference-server-nvidia/_cm.yaml
@@ -197,6 +197,10 @@ versions:
 docker:
   skip_run_cmd: 'no'
   all_gpus: 'yes'
+  docker_os: ubuntu
+  docker_real_run: True
+  docker_os_version: '20.04'
+  base_image: nvcr.io/nvidia/mlperf/mlperf-inference:mlpinf-v3.1-cuda12.2-cudnn8.9-x86_64-ubuntu20.04-public
   docker_input_mapping:
     imagenet_path: IMAGENET_PATH
     results_dir: RESULTS_DIR
@@ -213,6 +217,3 @@ docker:
    - "${{ CM_TENSORRT_TAR_FILE_PATH }}:${{ CM_TENSORRT_TAR_FILE_PATH }}"
    - "${{ CUDA_RUN_FILE_LOCAL_PATH }}:${{ CUDA_RUN_FILE_LOCAL_PATH }}"
    - "${{ MLPERF_SCRATCH_PATH }}:${{ MLPERF_SCRATCH_PATH }}"
-  pre_run_cmds:
-    - cm pull repo mlcommons@ck
-    - cm run script --tags=get,dataset,original,imagenet,_full --imagenet_path=/data/imagenet-val
diff --git a/cm-mlops/script/get-cuda/_cm.json b/cm-mlops/script/get-cuda/_cm.json
@@ -10,7 +10,6 @@
     "CM_REQUIRE_INSTALL": "no"
   },
   "docker": {
-    "run": false
   },
   "input_mapping": {
     "cudnn_tar_path": "CM_CUDNN_TAR_FILE_PATH",

diff --git a/cm-mlops/script/get-generic-python-lib/_cm.json b/cm-mlops/script/get-generic-python-lib/_cm.json
@@ -389,8 +389,8 @@
         }
       ],
       "env": {
-        "CM_GENERIC_PYTHON_PACKAGE_NAME": "nvidia-dali-cuda110",
-        "CM_GENERIC_PYTHON_PIP_EXTRA": " --upgrade",
+        "CM_GENERIC_PYTHON_PACKAGE_NAME": "nvidia-dali-cuda120",
+        "CM_GENERIC_PYTHON_PIP_EXTRA": " --upgrade --default-timeout=900",
         "CM_GENERIC_PYTHON_PIP_EXTRA_INDEX_URL": "https://developer.download.nvidia.com/compute/redist"
       },
       "new_env_keys": [

diff --git a/cm-mlops/script/get-ml-model-bert-large-squad/_cm.json b/cm-mlops/script/get-ml-model-bert-large-squad/_cm.json
@@ -100,7 +100,8 @@
     },
     "onnx,int8": {
       "env": {
-        "CM_ML_MODEL_F1": "90.067"
+        "CM_ML_MODEL_F1": "90.067",
+        "CM_PACKAGE_URL": "https://zenodo.org/record/3750364/files/bert_large_v1_1_fake_quant.onnx"
       }
     },
     "onnx,int8,zenodo": {

diff --git a/cm-mlops/script/get-tensorrt/customize.py b/cm-mlops/script/get-tensorrt/customize.py
@@ -11,7 +11,8 @@ def preprocess(i):
     env = i['env']
 
 
-    if env.get('CM_TENSORRT_TAR_FILE_PATH','')=='' and env.get('CM_TENSORRT_REQUIRE_DEV', '') != 'yes' and env.get('CM_HOST_PLATFORM_FLAVOR', '') != 'aarch64':
+    #Not enforcing dev requirement for now
+    if env.get('CM_TENSORRT_TAR_FILE_PATH','')=='' and env.get('CM_TENSORRT_REQUIRE_DEV1', '') != 'yes' and env.get('CM_HOST_PLATFORM_FLAVOR', '') != 'aarch64':
 
        if os_info['platform'] == 'windows':
            extra_pre=''

diff --git a/cm-mlops/script/reproduce-mlperf-inference-nvidia/_cm.yaml b/cm-mlops/script/reproduce-mlperf-inference-nvidia/_cm.yaml
@@ -299,6 +299,10 @@ variations:
     default: true
     env:
       CM_MODEL: resnet50
+    deps:
+    - tags: get,generic-python-lib,_onnx-graphsurgeon
+    - tags: get,generic-python-lib,_package.onnx
+      version: 1.13.1
 
   retinanet:
     group: model
@@ -312,6 +316,9 @@ variations:
     - tags: get,generic-python-lib,_opencv-python
     - tags: get,generic-python-lib,_numpy
     - tags: get,generic-python-lib,_pycocotools
+    - tags: get,generic-python-lib,_onnx-graphsurgeon
+    - tags: get,generic-python-lib,_package.onnx
+      version: 1.13.1
 
   bert_:
     deps:

diff --git a/cm-mlops/script/run-docker-container/_cm.json b/cm-mlops/script/run-docker-container/_cm.json
@@ -22,6 +22,7 @@
     "image_tag": "CM_DOCKER_IMAGE_TAG",
     "docker_os": "CM_DOCKER_OS",
     "docker_os_version": "CM_DOCKER_OS_VERSION",
+    "docker_image_base": "CM_DOCKER_IMAGE_BASE",
     "script_tags": "CM_DOCKER_RUN_SCRIPT_TAGS",
     "run_cmd_extra": "CM_DOCKER_RUN_CMD_EXTRA",
     "real_run": "CM_REAL_RUN",

diff --git a/docs/getting-started.md b/docs/getting-started.md
@@ -1,160 +1,4 @@
 [ [Back to documentation](README.md) ]
 
-*Under development*
-
-# CM Getting Started Guide
-
-## Image classification example
-
-One of the goals of the [MLCommons CM workflow automation framework (CM)](https://github.com/mlcommons/ck?tab=readme-ov-file#about) 
-is to provide a common, simple and human readable interface to run and manage complex software projects and benchmarks 
-on any platform with any software stack in a unified and automated way.
-
-This tutorial explains how CM works and should help you start using it with existing projects 
-or to modularize and unify your own projects.
-
-Let us test CM to run image classification from the command line on any platform with Windows, Linux and MacOS.
-
-### Installing CM
-
-CM is implemented as a [very small Python library](https://github.com/mlcommons/ck/tree/master/cm/cmind) 
-with `cm` and `cmr` front-ends and minimal dependencies (Python 3+, git and wget) 
-that can be installed via PIP:
-
-
-```bash
-pip install cmind
-```
-
-You may need to re-login to update the PATH to `cm` and `cmr` front-ends.
-
-Note that CM can be also installed from virtual environment (required in Ubuntu 23.04+) and inside containers.
-You can check a detailed guide to install CM on different platforms [here](installation.md).
-
-### Pulling some repository with embedded CM interface
-
-Let's now pull a Git repository that has embedded CM interface 
-(note that if your Git repository doesn't have CM interface embedded,
-CM will automatically initialize one):
-
-```bash
-cm pull repo mlcommons@ck
-```
-
-CM will pull GitHub repository from `https://github.com/mlcommons/ck` to the `CM/repos` directory in your local HOME directory.
-You can use flag `--url=https://github.com/mlcommons/ck` instead of `mlcommons@ck` to pull any Git repository.
-
-CM will then check if this repository has a CM interface by checking the [`cmr.yaml`](https://github.com/mlcommons/ck/blob/master/cmr.yaml) 
-file in the root directory of this repository (abbreviation for `C`ollective `M`ind `R`epository):
-
-```yaml
-git: true
-alias: mlcommons@ck
-uid: a4705959af8e447a
-version: 1.5.4
-prefix: cm-mlops
-```
-
-Note that this file will be automatically generated if it doesn't exist in your repository.
-
-While working on modularizing, unifying and automating MLPerf benchmarks,
-we decided to embed a CM interface to this development repository 
-in the [cm-mlops directory](https://github.com/mlcommons/ck/tree/master/cm-mlops)
-
-The `prefix` in `cmr.yaml` tells CM to search for the CM interface in some sub-directory of a given repository
-to avoid altering the original structure of software projects.
-
-### Using CM interface to run a given software project
-
-You can now invoke a human-friendly CM command to run your project such as image classification
-(we will show how to use Python API later):
-
-```bash
-cm run script "python app image-classification onnx"
-```
-
-CM will recursively walk through all pulled or downloaded repositories in your home `CM/repos` directory
-and search for matching tags `python,app,image-classification,onnx` in all `_cm.yaml` or `_cm.json`
-files in a `script` sub-directory of all repositories.
-
-In our case, CM will find 1 match in 
-the [`cm-mlops/script/app-image-classification-onnx-py/_cm.yaml`](https://github.com/mlcommons/ck/blob/master/cm-mlops/script/app-image-classification-onnx-py/_cm.yaml).
-
-This file tells CM how to prepare environment variables, paths and command lines 
-to run a native script or tool on any platform.
-
-
-#### 
-
-
-
-
-
-
-
-
-
-
-
-
-#### Using inputs and environment variables
-
-env
-
-const
-
-
-default_env
-
-input_mapping
-
-
-
-#### Using variations
-
-using the same code/script/tool but altering it's behavior and sub-dependencies
-
-CUDA
-
-_cuda
-
-
-#### Reporting issues
-
-The community helped us test this example on many platforms but if you still encounter
-some issues, please report them [here](https://github.com/mlcommons/ck/issues) - CM is not a magic (yet)
-and our concept is to collaboratively extend CM workflows to gradually improve their portability and reproducibility
-across diverse software and hardware.
-
-
-
-#### Debugging CM interface
-
-
-#### Extending CM interface
-
-
-### Reusing automation recipes
-
-
-
-### Adding CM interface to your own project
-
-
-
-### Using CM with containers
-
-
-### Using CM GUI
-
-
-### Running MLPerf and other projects via CM 
-
-Recent examples from MLPerf and ML, compiler and systems conferences
-
-
-### Participating in collaborative developments
-
-This is a community project being developed by the [MLCommons Task Force on Automation and Reproducibility](taskforce.md)
-based on your feedback - please join our [public Discord server](https://discord.gg/JjWNWXKxwT) if you 
-would like to help with developments or have questions, suggestions and feature requests.
+*20240130: we are updating this page based on the feedback from the [CM users and MLPerf submitters](https://github.com/mlcommons/ck/issues/1052) -
+ it should be ready within a week - please [stay tuned](https://discord.gg/JjWNWXKxwT)*.
diff --git a/requirements.txt b/requirements.txt
@@ -1 +1,2 @@
+cmind>=1.6.0
 pyyaml