Merge branch 'main' into main
arjunsuresh authored Oct 29, 2024
2 parents cc3d251 + b209819 commit 5e9c03c
Showing 36 changed files with 639 additions and 248 deletions.
1 change: 0 additions & 1 deletion .github/workflows/test-mlperf-inference-sdxl.yaml
@@ -19,7 +19,6 @@ jobs:
source gh_action/bin/deactivate || python3 -m venv gh_action
source gh_action/bin/activate
export CM_REPOS=$HOME/GH_CM
cm rm repo mlcommons@cm4mlops -f
python3 -m pip install cm4mlops
cm pull repo
cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=sdxl --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean
9 changes: 4 additions & 5 deletions .github/workflows/test-nvidia-mlperf-implementation.yml
@@ -2,7 +2,7 @@ name: MLPerf Inference Nvidia implementations

on:
schedule:
- cron: "00 10 * * *" #to be adjusted
- cron: "04 18 * * *" #to be adjusted

jobs:
build_nvidia:
@@ -12,15 +12,14 @@ jobs:
fail-fast: false
matrix:
python-version: [ "3.12" ]
model: [ "resnet50" ]
model: [ "resnet50", "retinanet", "bert-99", "bert-99.9", "gptj-99.9", "3d-unet-99.9" ]
steps:
- name: Test MLPerf Inference NVIDIA ${{ matrix.model }}
run: |
if [ -f "gh_action/bin/deactivate" ]; then source gh_action/bin/deactivate; fi
python3 -m venv gh_action
source gh_action/bin/activate
export CM_REPOS=$HOME/GH_CM
cm rm repo mlcommons@cm4mlops -f
pip install --upgrade cm4mlops
cm run script --tags=run-mlperf,inference,_all-scenarios,_submission,_full,_r4.1-dev --execution_mode=valid --gpu_name=rtx_4090 --offline_target_qps=85000 --server_target_qps=73000 --submitter="MLCommons" --hw_name=gh_ubuntu_x86 --model=resnet50 --implementation=nvidia --backend=tensorrt --category=datacenter,edge --division=closed --docker_dt=yes --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --device=cuda --use_dataset_from_host=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean --docker --quiet
cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/mlperf_inference_unofficial_submissions_v5.0 --repo_branch=main --commit_message="Results from ${{ matrix.model }} GH action on NVIDIARTX4090" --quiet --submission_dir=$HOME/gh_action_submissions
cm run script --tags=run-mlperf,inference,_all-scenarios,_submission,_full,_r4.1-dev --execution_mode=valid --gpu_name=rtx_4090 --model=${{ matrix.model }} --submitter="MLCommons" --hw_name=RTX4090x2 --implementation=nvidia --backend=tensorrt --category=datacenter,edge --division=closed --docker_dt=yes --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --device=cuda --use_dataset_from_host=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean --docker --quiet
cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/mlperf_inference_unofficial_submissions_v5.0 --repo_branch=main --commit_message="Results from GH action on NVIDIA_RTX4090x2" --quiet --submission_dir=$HOME/gh_action_submissions --hw_name=RTX4090x2
1 change: 0 additions & 1 deletion .github/workflows/test-scc24-sdxl.yaml
@@ -24,7 +24,6 @@ jobs:
python3 -m venv gh_action
source gh_action/bin/activate
export CM_REPOS=$HOME/GH_CM
cm rm repo mlcommons@cm4mlops -f
pip install --upgrade cm4mlops
pip install tabulate
cm pull repo
4 changes: 1 addition & 3 deletions README.md
@@ -13,9 +13,7 @@
[![Test QAIC Software kit Compilation](https://github.com/mlcommons/cm4mlops/actions/workflows/test-qaic-software-kit.yml/badge.svg)](https://github.com/mlcommons/cm4mlops/actions/workflows/test-qaic-software-kit.yml)


The `mlperf-branch` of the **cm4mlops** repository contains updated CM scripts specifically for MLPerf Inference, including support for Automotive. Please note that the general CM scripts in this branch may not be compatible with other projects. For more information on using CM for MLPerf Inference, visit the [MLPerf Inference Documentation site](https://docs.mlcommons.org/inference/).

[![Streamline your MLPerf results using CM Framework](https://img.youtube.com/vi/eI1Hoecc3ho/0.jpg)](https://youtu.be/eI1Hoecc3ho)
Please see the [docs](https://docs.mlcommons.org/cm4mlops/) site for understanding CM scripts better. The `mlperf-branch` of the **cm4mlops** repository contains updated CM scripts specifically for MLPerf Inference. For more information on using CM for MLPerf Inference, visit the [MLPerf Inference Documentation site](https://docs.mlcommons.org/inference/).

## News

25 changes: 15 additions & 10 deletions automation/script/module_misc.py
@@ -1086,7 +1086,9 @@ def doc(i):


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def update_path_for_docker(path, mounts, force_path_target=''):
# This function takes a host path and returns the absolute path on the host and the corresponding path inside the container
# If mounts is passed, the function appends the host path and the container path to mounts in the form "host_path:container_path"
def update_path_for_docker(path, mounts=None, force_path_target=''):

path_orig = ''
path_target = ''
@@ -1114,14 +1116,14 @@ def update_path_for_docker(path, mounts, force_path_target=''):
x = path_orig + ':' + path_target

# Check for duplicates
to_add = True
for y in mounts:
if y.lower()==x.lower():
to_add = False
break

if to_add:
mounts.append(x)
if mounts != None:
to_add = True
for y in mounts:
if y.lower()==x.lower():
to_add = False
break
if to_add:
mounts.append(x)


return (path_orig, path_target)
@@ -1617,8 +1619,11 @@ def get_container_path(value):
new_path_split1 = new_path_split + path_split[repo_entry_index:repo_entry_index+3]
new_path_split2 = new_path_split + path_split[repo_entry_index:]
return "/".join(new_path_split1), "/".join(new_path_split2)
else:
orig_path,target_path = update_path_for_docker(path=value)
return target_path, target_path

return value, value
# return value, value


############################################################
135 changes: 135 additions & 0 deletions docs/getting-started.md
@@ -0,0 +1,135 @@

# Getting Started with CM Script Automation

## Running CM Scripts

To execute a simple script in CM that captures OS details, use the following command:

```bash
cm run script --tags=detect,os -j
```

This command gathers details about the system on which it's run, such as:

```json
{
"CM_HOST_OS_TYPE": "linux",
"CM_HOST_OS_BITS": "64",
"CM_HOST_OS_FLAVOR": "ubuntu",
"CM_HOST_OS_FLAVOR_LIKE": "debian",
"CM_HOST_OS_VERSION": "24.04",
"CM_HOST_OS_KERNEL_VERSION": "6.8.0-45-generic",
"CM_HOST_OS_GLIBC_VERSION": "2.39",
"CM_HOST_OS_MACHINE": "x86_64",
"CM_HOST_OS_PACKAGE_MANAGER": "apt",
"CM_HOST_OS_PACKAGE_MANAGER_INSTALL_CMD": "DEBIAN_FRONTEND=noninteractive apt-get install -y",
"CM_HOST_OS_PACKAGE_MANAGER_UPDATE_CMD": "apt-get update -y",
"+CM_HOST_OS_DEFAULT_LIBRARY_PATH": [
"/usr/local/lib/x86_64-linux-gnu",
"/lib/x86_64-linux-gnu",
"/usr/lib/x86_64-linux-gnu",
"/usr/lib/x86_64-linux-gnu64",
"/usr/local/lib64",
"/lib64",
"/usr/lib64",
"/usr/local/lib",
"/lib",
"/usr/lib",
"/usr/x86_64-linux-gnu/lib64",
"/usr/x86_64-linux-gnu/lib"
],
"CM_HOST_PLATFORM_FLAVOR": "x86_64",
"CM_HOST_PYTHON_BITS": "64",
"CM_HOST_SYSTEM_NAME": "intel-spr-i9"
}
```

For more details on CM scripts, see the [CM documentation](index.md).

### Adding New CM Scripts

CM aims to provide lightweight connectors between existing automation scripts and tools without replacing them. You can add your own scripts to CM with the following command, which creates a script named `hello-world`:

```bash
cm add script hello-world --tags=hello-world,display,test
```

This command initializes a CM script in the local repository with the following structure:

```
└── CM
├── index.json
├── repos
│ ├── local
│ │ ├── cfg
│ │ ├── cache
│ │ ├── cmr.yaml
│ │ └── script
│ │ └── hello-world
│ │ ├── _cm.yaml
│ │ ├── customize.py
│ │ ├── README-extra.md
│ │ ├── run.bat
│ │ └── run.sh
│ └── mlcommons@cm4mlops
└── repos.json
```
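For illustration, the generated `_cm.yaml` holds the script's alias, its unique ID, and the tags you passed on the command line; the exact fields and values below are a hedged sketch and will differ in your local repository:

```yaml
# Hypothetical _cm.yaml for the newly added hello-world script (values are illustrative)
alias: hello-world
automation_alias: script
automation_uid: 5b4e0237da074764   # UID of the CM "script" automation; assumed here
cache: false                       # set to true to cache this script's outputs
tags:
- hello-world
- display
- test
uid: 0123456789abcdef              # auto-generated unique ID; yours will differ
```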

You can also execute the script from Python as follows:

```python
import cmind
output = cmind.access({'action':'run', 'automation':'script', 'tags':'hello-world,display,test'})
if output['return'] == 0:
    print(output)
```

If you discover that your new script is similar to an existing script in any CM repository, you can clone an existing script using the following command:

```bash
cm copy script <source_script> .:<target_script>
```

Here, `<source_script>` is the name of the existing script, and `<target_script>` is the name of the new script you're creating. Existing script names in the `cm4mlops` repository can be found [here](https://github.com/mlcommons/cm4mlops/tree/mlperf-inference/script).
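For instance, a sketch of cloning a script that appears elsewhere in this commit (the target name is hypothetical):

```bash
# Clone the existing ONNX image-classification script into a new local script
cm copy script app-image-classification-onnx-py .:my-image-classification
```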

## Caching and Reusing CM Script Outputs

By default, CM scripts run in the current directory and record all new files there. For example, a universal download script might download an image to the current directory:

```bash
cm run script --tags=download,file,_wget --url=https://cKnowledge.org/ai/data/computer_mouse.jpg --verify=no --env.CM_DOWNLOAD_CHECKSUM=45ae5c940233892c2f860efdf0b66e7e
```

To cache and reuse the output of scripts, CM offers a `cache` automation similar to `script`. When `"cache":true` is specified in a script's metadata, CM creates a `cache` entry in `$HOME/CM/repos/local` with a unique ID and the same tags as the script, and executes the script there.

Subsequent executions of the same script will reuse files from the cache, avoiding redundancy. This is especially useful for large files or data sets.
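As a minimal sketch (assuming the YAML form of the script metadata), enabling this behaviour is a single key in `_cm.yaml`:

```yaml
# Illustrative fragment of a script's _cm.yaml: with cache enabled, CM stores the
# script's outputs in a tagged cache entry under $HOME/CM/repos/local/cache
cache: true
tags:
- download
- file
```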

You can manage cache entries and find specific ones using commands like:

```bash
cm show cache
cm show cache --tags=get,ml-model,resnet50,_onnx
cm find cache --tags=download,file,ml-model,resnet50,_onnx
cm info cache --tags=download,file,ml-model,resnet50,_onnx
```

To clean cache entries:

```bash
cm rm cache --tags=ml-model,resnet50
cm rm cache -f # Clean all entries
```

You can completely reset the CM framework by removing the `$HOME/CM` directory, which deletes all downloaded repositories and cached entries.
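For example:

```bash
# Full reset: deletes all pulled CM repositories and all cached entries
rm -rf $HOME/CM
```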

## Integration with Containers

CM scripts are designed to run natively or inside containers with the same commands. You can substitute `cm run script` with `cm docker script` to execute a script inside an automatically generated container:

```bash
cm docker script --tags=python,app,image-classification,onnx,_cpu
```

CM automatically handles the generation of Dockerfiles, building of containers, and execution within containers, providing a seamless experience whether running scripts natively or in containers.
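The container-related flags that appear in this repository's GitHub Actions workflows (for example `--docker`, `--docker_it`, `--docker_dt`) can also be appended to a regular `cm run script` call; treat the exact combination below as an illustrative sketch rather than a canonical invocation:

```bash
# Illustrative: build and run the same script inside an auto-generated container,
# detached (docker_dt=yes) and non-interactive (docker_it=no), as in the CI workflows
cm run script --tags=python,app,image-classification,onnx,_cpu \
    --docker --docker_dt=yes --docker_it=no --quiet
```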

This approach simplifies the development process by eliminating the need for separate Dockerfile maintenance and allows for the use of native scripts and workflows directly within containers.
6 changes: 3 additions & 3 deletions docs/index.md
@@ -4,7 +4,7 @@ Please check the [CM documentation](https://docs.mlcommons.org/ck) for more deta

See the [automatically generated catalog](scripts/index.md) of all CM scripts from MLCommons.

## Getting started with CM scripts
## Understanding CM scripts

* A CM script is identified by a set of tags and by a unique ID.
* Furthermore, each CM script can have multiple variations; these are identified by variation tags, which are treated the same way as tags and marked by a `_` prefix.
@@ -69,8 +69,8 @@ Sometimes it is difficult to add all variations needed for a script like say `ba
* By using the `--new` input, a new cache entry can be forced even when an old one exists.
* By default, no dependencies are run for a cached entry unless the `dynamic` key is set for it.

### Updating ENV from inside the run script
* [TBD]

Please see [here](getting-started.md) for trying CM scripts.



1 change: 1 addition & 0 deletions mkdocs.yml
@@ -20,6 +20,7 @@ theme:
- toc.follow
nav:
- HOME: index.md
- Getting Started: getting-started.md
- CM Scripts:
- scripts/index.md
- Python-automation: scripts/Python-automation/index.md
2 changes: 1 addition & 1 deletion script/app-image-classification-onnx-py/_cm.yaml
@@ -106,7 +106,7 @@ input_description:
docker:
skip_run_cmd: 'no'
skip_cm_sys_upgrade: 'yes'
cm_repo_flags: '--checkout=dev'
cm_repo_flags: '--branch=dev'
use_host_group_id: 'yes'
image_tag_extra: '-cm-dev'
input_paths:
3 changes: 3 additions & 0 deletions script/app-mlperf-inference-mlcommons-python/_cm.yaml
@@ -995,6 +995,9 @@ variations:
- tags: get,generic-python-lib,_package.more-itertools
names:
- more-itertools
- tags: get,generic-python-lib,_package.compressed_tensors
names:
- compressed_tensors

llama2-70b-99:
group: models
3 changes: 2 additions & 1 deletion script/app-mlperf-inference-mlcommons-python/customize.py
@@ -300,11 +300,12 @@ def get_run_cmd_reference(os_info, env, scenario_extra_options, mode_extra_optio
" --dataset-path " + env['CM_DATASET_PATH_ROOT'] + \
' --dtype ' + env['CM_MLPERF_MODEL_PRECISION'].replace("bfloat", "bf").replace("float", "fp") + \
" --device " + device + \
" --max-batchsize " + max_batchsize + \
env['CM_MLPERF_LOADGEN_EXTRA_OPTIONS'] + \
scenario_extra_options + mode_extra_options + \
" --output " + env['CM_MLPERF_OUTPUT_DIR'] + \
" --model-path " + env['CM_ML_MODEL_PATH']
if "--max-batchsize" not in cmd:
cmd += " --max-batchsize " + max_batchsize
if env.get('CM_COCO2014_SAMPLE_ID_PATH','') != '':
cmd += " --ids-path " + env['CM_COCO2014_SAMPLE_ID_PATH']

25 changes: 15 additions & 10 deletions script/app-mlperf-inference-nvidia/_cm.yaml
@@ -430,8 +430,6 @@ variations:
CM_ML_MODEL_WEIGHTS_DATA_TYPE: int8
deps:
- tags: get,generic-python-lib,_Pillow
- tags: get,generic-python-lib,_torch
- tags: get,generic-python-lib,_torchvision
- tags: get,generic-python-lib,_opencv-python
- tags: get,generic-python-lib,_numpy
- tags: get,generic-python-lib,_pycocotools
@@ -527,6 +525,7 @@ variations:
- tags: get,generic-python-lib,_transformers
- tags: get,generic-python-lib,_safetensors
- tags: get,generic-python-lib,_onnx
- tags: get,generic-python-lib,_package.sympy
- tags: get,generic-python-lib,_onnx-graphsurgeon

bert-99:
@@ -594,10 +593,10 @@
CM_ML_MODEL_WEIGHTS_DATA_TYPE: fp16
deps:
- tags: get,generic-python-lib,_toml
- tags: get,generic-python-lib,_torchvision
- tags: get,generic-python-lib,_torchvision_cuda
names:
- torchvision
- tags: get,generic-python-lib,_torch
- tags: get,generic-python-lib,_torch_cuda
- tags: get,generic-python-lib,_nvidia-apex
- tags: get,generic-python-lib,_unidecode
- tags: get,generic-python-lib,_inflect
@@ -613,7 +612,6 @@
- CM_DLRM_V2_AGGREGATION_TRACE_FILE_PATH
deps:
- tags: get,dlrm,data,mlperf,inference,_nvidia
- tags: get,generic-python-lib,_torch
- tags: get,generic-python-lib,_package.torchsnapshot
- tags: get,generic-python-lib,_package.torchrec
version: 0.3.2
@@ -762,6 +760,11 @@ variations:
CM_MLPERF_NVIDIA_HARNESS_USE_TRITON: "yes"
CM_MLPERF_SUT_NAME_RUN_CONFIG_SUFFIX3: "using_triton"

use-graphs:
group: graphs
env:
CM_MLPERF_NVIDIA_HARNESS_USE_GRAPHS: "yes"

prebuild:
group: run-mode
env:
@@ -1106,7 +1109,8 @@ variations:

singlestream,resnet50:
env:
SKIP_POLICIES: '1'
CM_MLPERF_NVIDIA_HARNESS_DISABLE_BETA1_SMALLK: yes
SKIP_POLICIES: '0' # skip_policies used to give better latency but does not work with Nvidia code version 4.0 and later

server,resnet50:
env:
@@ -1118,7 +1122,8 @@

multistream,resnet50:
env:
SKIP_POLICIES: '1'
CM_MLPERF_NVIDIA_HARNESS_DISABLE_BETA1_SMALLK: yes
SKIP_POLICIES: '0'

singlestream,run_harness:
default_variations:
@@ -1435,12 +1440,12 @@ variations:
rtx_4090,sdxl,offline,run_harness:
default_variations:
batch-size: batch_size.2
use_graphs: "True"
graphs: use-graphs

rtx_4090,sdxl,server,run_harness:
default_variations:
batch-size: batch_size.2
use_graphs: "True"
graphs: use-graphs

rtx_4090,resnet50,offline,run_harness:
default_variations:
@@ -1449,7 +1454,7 @@
rtx_4090,resnet50,server,run_harness:
default_variations:
batch-size: batch_size.32
use_graphs: "True"
graphs: use-graphs

rtx_4090,retinanet,offline,run_harness:
default_variations:
