Merge branch 'main' into main
arjunsuresh authored Oct 29, 2024
2 parents cc3d251 + b209819 commit 5e9c03c
Showing 36 changed files with 639 additions and 248 deletions.
1 change: 0 additions & 1 deletion .github/workflows/test-mlperf-inference-sdxl.yaml
@@ -19,7 +19,6 @@ jobs:
source gh_action/bin/deactivate || python3 -m venv gh_action
source gh_action/bin/activate
export CM_REPOS=$HOME/GH_CM
cm rm repo mlcommons@cm4mlops -f
python3 -m pip install cm4mlops
cm pull repo
cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=sdxl --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean
9 changes: 4 additions & 5 deletions .github/workflows/test-nvidia-mlperf-implementation.yml
@@ -2,7 +2,7 @@ name: MLPerf Inference Nvidia implementations

on:
schedule:
- cron: "00 10 * * *" #to be adjusted
- cron: "04 18 * * *" #to be adjusted

jobs:
build_nvidia:
@@ -12,15 +12,14 @@ jobs:
fail-fast: false
matrix:
python-version: [ "3.12" ]
model: [ "resnet50" ]
model: [ "resnet50", "retinanet", "bert-99", "bert-99.9", "gptj-99.9", "3d-unet-99.9" ]
steps:
- name: Test MLPerf Inference NVIDIA ${{ matrix.model }}
run: |
if [ -f "gh_action/bin/deactivate" ]; then source gh_action/bin/deactivate; fi
python3 -m venv gh_action
source gh_action/bin/activate
export CM_REPOS=$HOME/GH_CM
cm rm repo mlcommons@cm4mlops -f
pip install --upgrade cm4mlops
cm run script --tags=run-mlperf,inference,_all-scenarios,_submission,_full,_r4.1-dev --execution_mode=valid --gpu_name=rtx_4090 --offline_target_qps=85000 --server_target_qps=73000 --submitter="MLCommons" --hw_name=gh_ubuntu_x86 --model=resnet50 --implementation=nvidia --backend=tensorrt --category=datacenter,edge --division=closed --docker_dt=yes --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --device=cuda --use_dataset_from_host=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean --docker --quiet
cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/mlperf_inference_unofficial_submissions_v5.0 --repo_branch=main --commit_message="Results from ${{ matrix.model }} GH action on NVIDIARTX4090" --quiet --submission_dir=$HOME/gh_action_submissions
cm run script --tags=run-mlperf,inference,_all-scenarios,_submission,_full,_r4.1-dev --execution_mode=valid --gpu_name=rtx_4090 --model=${{ matrix.model }} --submitter="MLCommons" --hw_name=RTX4090x2 --implementation=nvidia --backend=tensorrt --category=datacenter,edge --division=closed --docker_dt=yes --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --device=cuda --use_dataset_from_host=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean --docker --quiet
cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/mlperf_inference_unofficial_submissions_v5.0 --repo_branch=main --commit_message="Results from GH action on NVIDIA_RTX4090x2" --quiet --submission_dir=$HOME/gh_action_submissions --hw_name=RTX4090x2
1 change: 0 additions & 1 deletion .github/workflows/test-scc24-sdxl.yaml
@@ -24,7 +24,6 @@ jobs:
python3 -m venv gh_action
source gh_action/bin/activate
export CM_REPOS=$HOME/GH_CM
cm rm repo mlcommons@cm4mlops -f
pip install --upgrade cm4mlops
pip install tabulate
cm pull repo
4 changes: 1 addition & 3 deletions README.md
@@ -13,9 +13,7 @@
[![Test QAIC Software kit Compilation](https://github.com/mlcommons/cm4mlops/actions/workflows/test-qaic-software-kit.yml/badge.svg)](https://github.com/mlcommons/cm4mlops/actions/workflows/test-qaic-software-kit.yml)


The `mlperf-branch` of the **cm4mlops** repository contains updated CM scripts specifically for MLPerf Inference, including support for Automotive. Please note that the general CM scripts in this branch may not be compatible with other projects. For more information on using CM for MLPerf Inference, visit the [MLPerf Inference Documentation site](https://docs.mlcommons.org/inference/).

[![Streamline your MLPerf results using CM Framework](https://img.youtube.com/vi/eI1Hoecc3ho/0.jpg)](https://youtu.be/eI1Hoecc3ho)
Please see the [docs](https://docs.mlcommons.org/cm4mlops/) site for understanding CM scripts better. The `mlperf-branch` of the **cm4mlops** repository contains updated CM scripts specifically for MLPerf Inference. For more information on using CM for MLPerf Inference, visit the [MLPerf Inference Documentation site](https://docs.mlcommons.org/inference/).

## News

25 changes: 15 additions & 10 deletions automation/script/module_misc.py
@@ -1086,7 +1086,9 @@ def doc(i):


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
def update_path_for_docker(path, mounts, force_path_target=''):
# This function takes a host path and returns the absolute path on the host and the corresponding path inside the container
# If mounts is passed, the function appends the host path and the container path to mounts in the form "host_path:container_path"
def update_path_for_docker(path, mounts=None, force_path_target=''):

path_orig = ''
path_target = ''
@@ -1114,14 +1116,14 @@ def update_path_for_docker(path, mounts, force_path_target=''):
x = path_orig + ':' + path_target

# Check for duplicates
to_add = True
for y in mounts:
if y.lower()==x.lower():
to_add = False
break

if to_add:
mounts.append(x)
if mounts != None:
to_add = True
for y in mounts:
if y.lower()==x.lower():
to_add = False
break
if to_add:
mounts.append(x)


return (path_orig, path_target)
@@ -1617,8 +1619,11 @@ def get_container_path(value):
new_path_split1 = new_path_split + path_split[repo_entry_index:repo_entry_index+3]
new_path_split2 = new_path_split + path_split[repo_entry_index:]
return "/".join(new_path_split1), "/".join(new_path_split2)
else:
orig_path,target_path = update_path_for_docker(path=value)
return target_path, target_path

return value, value
# return value, value


############################################################
135 changes: 135 additions & 0 deletions docs/getting-started.md
@@ -0,0 +1,135 @@

# Getting Started with CM Script Automation

## Running CM Scripts

To execute a simple script in CM that captures OS details, use the following command:

```bash
cm run script --tags=detect,os -j
```

This command gathers details about the system on which it's run, such as:

```json
{
"CM_HOST_OS_TYPE": "linux",
"CM_HOST_OS_BITS": "64",
"CM_HOST_OS_FLAVOR": "ubuntu",
"CM_HOST_OS_FLAVOR_LIKE": "debian",
"CM_HOST_OS_VERSION": "24.04",
"CM_HOST_OS_KERNEL_VERSION": "6.8.0-45-generic",
"CM_HOST_OS_GLIBC_VERSION": "2.39",
"CM_HOST_OS_MACHINE": "x86_64",
"CM_HOST_OS_PACKAGE_MANAGER": "apt",
"CM_HOST_OS_PACKAGE_MANAGER_INSTALL_CMD": "DEBIAN_FRONTEND=noninteractive apt-get install -y",
"CM_HOST_OS_PACKAGE_MANAGER_UPDATE_CMD": "apt-get update -y",
"+CM_HOST_OS_DEFAULT_LIBRARY_PATH": [
"/usr/local/lib/x86_64-linux-gnu",
"/lib/x86_64-linux-gnu",
"/usr/lib/x86_64-linux-gnu",
"/usr/lib/x86_64-linux-gnu64",
"/usr/local/lib64",
"/lib64",
"/usr/lib64",
"/usr/local/lib",
"/lib",
"/usr/lib",
"/usr/x86_64-linux-gnu/lib64",
"/usr/x86_64-linux-gnu/lib"
],
"CM_HOST_PLATFORM_FLAVOR": "x86_64",
"CM_HOST_PYTHON_BITS": "64",
"CM_HOST_SYSTEM_NAME": "intel-spr-i9"
}
```

For more details on CM scripts, see the [CM documentation](index.md).

### Adding New CM Scripts

CM aims to provide lightweight connectors between existing automation scripts and tools without replacing them. You can add your own scripts to CM with the following command, which creates a script named `hello-world`:

```bash
cm add script hello-world --tags=hello-world,display,test
```

This command initializes a CM script in the local repository with the following structure:

```
└── CM
├── index.json
├── repos
│ ├── local
│ │ ├── cfg
│ │ ├── cache
│ │ ├── cmr.yaml
│ │ └── script
│ │ └── hello-world
│ │ ├── _cm.yaml
│ │ ├── customize.py
│ │ ├── README-extra.md
│ │ ├── run.bat
│ │ └── run.sh
│ └── mlcommons@cm4mlops
└── repos.json
```
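For illustration, the generated `_cm.yaml` holds the script's alias, its unique ID, and the tags you passed on the command line; the exact fields and values below are a hedged sketch and will differ in your local repository:

```yaml
# Hypothetical _cm.yaml for the newly added hello-world script (values are illustrative)
alias: hello-world
automation_alias: script
automation_uid: 5b4e0237da074764   # UID of the CM "script" automation; assumed here
cache: false                       # set to true to cache this script's outputs
tags:
- hello-world
- display
- test
uid: 0123456789abcdef              # auto-generated unique ID; yours will differ
```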

You can also execute the script from Python as follows:

```python
import cmind
output = cmind.access({'action':'run', 'automation':'script', 'tags':'hello-world,display,test'})
if output['return'] == 0:
    print(output)
```

If you discover that your new script is similar to an existing script in any CM repository, you can clone an existing script using the following command:

```bash
cm copy script <source_script> .:<target_script>
```

Here, `<source_script>` is the name of the existing script, and `<target_script>` is the name of the new script you're creating. Existing script names in the `cm4mlops` repository can be found [here](https://github.com/mlcommons/cm4mlops/tree/mlperf-inference/script).
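For instance, a sketch of cloning a script that appears elsewhere in this commit (the target name is hypothetical):

```bash
# Clone the existing ONNX image-classification script into a new local script
cm copy script app-image-classification-onnx-py .:my-image-classification
```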

## Caching and Reusing CM Script Outputs

By default, CM scripts run in the current directory and record all new files there. For example, a universal download script might download an image to the current directory:

```bash
cm run script --tags=download,file,_wget --url=https://cKnowledge.org/ai/data/computer_mouse.jpg --verify=no --env.CM_DOWNLOAD_CHECKSUM=45ae5c940233892c2f860efdf0b66e7e
```

To cache and reuse the output of scripts, CM offers a `cache` automation similar to `script`. When `"cache":true` is specified in a script's metadata, CM creates a `cache` entry in `$HOME/CM/repos/local` with a unique ID and the same tags as the script, and executes the script there.

Subsequent executions of the same script will reuse files from the cache, avoiding redundancy. This is especially useful for large files or data sets.
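As a minimal sketch (assuming the YAML form of the script metadata), enabling this behaviour is a single key in `_cm.yaml`:

```yaml
# Illustrative fragment of a script's _cm.yaml: with cache enabled, CM stores the
# script's outputs in a tagged cache entry under $HOME/CM/repos/local/cache
cache: true
tags:
- download
- file
```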

You can manage cache entries and find specific ones using commands like:

```bash
cm show cache
cm show cache --tags=get,ml-model,resnet50,_onnx
cm find cache --tags=download,file,ml-model,resnet50,_onnx
cm info cache --tags=download,file,ml-model,resnet50,_onnx
```

To clean cache entries:

```bash
cm rm cache --tags=ml-model,resnet50
cm rm cache -f # Clean all entries
```

You can completely reset the CM framework by removing the `$HOME/CM` directory, which deletes all downloaded repositories and cached entries.
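For example:

```bash
# Full reset: deletes all pulled CM repositories and all cached entries
rm -rf $HOME/CM
```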

## Integration with Containers

CM scripts are designed to run natively or inside containers with the same commands. You can substitute `cm run script` with `cm docker script` to execute a script inside an automatically generated container:

```bash
cm docker script --tags=python,app,image-classification,onnx,_cpu
```

CM automatically handles the generation of Dockerfiles, building of containers, and execution within containers, providing a seamless experience whether running scripts natively or in containers.
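The container-related flags that appear in this repository's GitHub Actions workflows (for example `--docker`, `--docker_it`, `--docker_dt`) can also be appended to a regular `cm run script` call; treat the exact combination below as an illustrative sketch rather than a canonical invocation:

```bash
# Illustrative: build and run the same script inside an auto-generated container,
# detached (docker_dt=yes) and non-interactive (docker_it=no), as in the CI workflows
cm run script --tags=python,app,image-classification,onnx,_cpu \
    --docker --docker_dt=yes --docker_it=no --quiet
```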

This approach simplifies the development process by eliminating the need for separate Dockerfile maintenance and allows for the use of native scripts and workflows directly within containers.
6 changes: 3 additions & 3 deletions docs/index.md
@@ -4,7 +4,7 @@ Please check the [CM documentation](https://docs.mlcommons.org/ck) for more deta

See the [automatically generated catalog](scripts/index.md) of all CM scripts from MLCommons.

## Getting started with CM scripts
## Understanding CM scripts

* A CM script is identified by a set of tags and by a unique ID.
* Furthermore, each CM script can have multiple variations; these are identified by variation tags, which are treated the same way as tags and marked by a `_` prefix.
@@ -69,8 +69,8 @@ Sometimes it is difficult to add all variations needed for a script like say `ba
* By using the `--new` input, a new cache entry can be forced even when an old one exists.
* By default, no dependencies are run for a cached entry unless the `dynamic` key is set for it.

### Updating ENV from inside the run script
* [TBD]

Please see [here](getting-started.md) for trying CM scripts.



1 change: 1 addition & 0 deletions mkdocs.yml
@@ -20,6 +20,7 @@ theme:
- toc.follow
nav:
- HOME: index.md
- Getting Started: getting-started.md
- CM Scripts:
- scripts/index.md
- Python-automation: scripts/Python-automation/index.md
2 changes: 1 addition & 1 deletion script/app-image-classification-onnx-py/_cm.yaml
@@ -106,7 +106,7 @@ input_description:
docker:
skip_run_cmd: 'no'
skip_cm_sys_upgrade: 'yes'
cm_repo_flags: '--checkout=dev'
cm_repo_flags: '--branch=dev'
use_host_group_id: 'yes'
image_tag_extra: '-cm-dev'
input_paths:
3 changes: 3 additions & 0 deletions script/app-mlperf-inference-mlcommons-python/_cm.yaml
@@ -995,6 +995,9 @@ variations:
- tags: get,generic-python-lib,_package.more-itertools
names:
- more-itertools
- tags: get,generic-python-lib,_package.compressed_tensors
names:
- compressed_tensors

llama2-70b-99:
group: models
3 changes: 2 additions & 1 deletion script/app-mlperf-inference-mlcommons-python/customize.py
@@ -300,11 +300,12 @@ def get_run_cmd_reference(os_info, env, scenario_extra_options, mode_extra_optio
" --dataset-path " + env['CM_DATASET_PATH_ROOT'] + \
' --dtype ' + env['CM_MLPERF_MODEL_PRECISION'].replace("bfloat", "bf").replace("float", "fp") + \
" --device " + device + \
" --max-batchsize " + max_batchsize + \
env['CM_MLPERF_LOADGEN_EXTRA_OPTIONS'] + \
scenario_extra_options + mode_extra_options + \
" --output " + env['CM_MLPERF_OUTPUT_DIR'] + \
" --model-path " + env['CM_ML_MODEL_PATH']
if "--max-batchsize" not in cmd:
cmd += " --max-batchsize " + max_batchsize
if env.get('CM_COCO2014_SAMPLE_ID_PATH','') != '':
cmd += " --ids-path " + env['CM_COCO2014_SAMPLE_ID_PATH']

25 changes: 15 additions & 10 deletions script/app-mlperf-inference-nvidia/_cm.yaml
@@ -430,8 +430,6 @@ variations:
CM_ML_MODEL_WEIGHTS_DATA_TYPE: int8
deps:
- tags: get,generic-python-lib,_Pillow
- tags: get,generic-python-lib,_torch
- tags: get,generic-python-lib,_torchvision
- tags: get,generic-python-lib,_opencv-python
- tags: get,generic-python-lib,_numpy
- tags: get,generic-python-lib,_pycocotools
@@ -527,6 +525,7 @@ variations:
- tags: get,generic-python-lib,_transformers
- tags: get,generic-python-lib,_safetensors
- tags: get,generic-python-lib,_onnx
- tags: get,generic-python-lib,_package.sympy
- tags: get,generic-python-lib,_onnx-graphsurgeon

bert-99:
@@ -594,10 +593,10 @@
CM_ML_MODEL_WEIGHTS_DATA_TYPE: fp16
deps:
- tags: get,generic-python-lib,_toml
- tags: get,generic-python-lib,_torchvision
- tags: get,generic-python-lib,_torchvision_cuda
names:
- torchvision
- tags: get,generic-python-lib,_torch
- tags: get,generic-python-lib,_torch_cuda
- tags: get,generic-python-lib,_nvidia-apex
- tags: get,generic-python-lib,_unidecode
- tags: get,generic-python-lib,_inflect
@@ -613,7 +612,6 @@
- CM_DLRM_V2_AGGREGATION_TRACE_FILE_PATH
deps:
- tags: get,dlrm,data,mlperf,inference,_nvidia
- tags: get,generic-python-lib,_torch
- tags: get,generic-python-lib,_package.torchsnapshot
- tags: get,generic-python-lib,_package.torchrec
version: 0.3.2
@@ -762,6 +760,11 @@ variations:
CM_MLPERF_NVIDIA_HARNESS_USE_TRITON: "yes"
CM_MLPERF_SUT_NAME_RUN_CONFIG_SUFFIX3: "using_triton"

use-graphs:
group: graphs
env:
CM_MLPERF_NVIDIA_HARNESS_USE_GRAPHS: "yes"

prebuild:
group: run-mode
env:
@@ -1106,7 +1109,8 @@ variations:

singlestream,resnet50:
env:
SKIP_POLICIES: '1'
CM_MLPERF_NVIDIA_HARNESS_DISABLE_BETA1_SMALLK: yes
SKIP_POLICIES: '0' # skip_policies used to give better latency but does not work with Nvidia code version 4.0 and later

server,resnet50:
env:
@@ -1118,7 +1122,8 @@

multistream,resnet50:
env:
SKIP_POLICIES: '1'
CM_MLPERF_NVIDIA_HARNESS_DISABLE_BETA1_SMALLK: yes
SKIP_POLICIES: '0'

singlestream,run_harness:
default_variations:
@@ -1435,12 +1440,12 @@ variations:
rtx_4090,sdxl,offline,run_harness:
default_variations:
batch-size: batch_size.2
use_graphs: "True"
graphs: use-graphs

rtx_4090,sdxl,server,run_harness:
default_variations:
batch-size: batch_size.2
use_graphs: "True"
graphs: use-graphs

rtx_4090,resnet50,offline,run_harness:
default_variations:
@@ -1449,7 +1454,7 @@
rtx_4090,resnet50,server,run_harness:
default_variations:
batch-size: batch_size.32
use_graphs: "True"
graphs: use-graphs

rtx_4090,retinanet,offline,run_harness:
default_variations:
