Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dev <- Mlperf inference #256

Merged
merged 25 commits into from
Sep 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
9afe484
CM_TMP_CURRENT_SCRIPT_PATH made accessible while updating dynamic env
anandhu-eng Sep 16, 2024
4128909
Merge branch 'mlperf-inference' into nvidia-sdxl-v4.1
arjunsuresh Sep 16, 2024
3c17f53
Merge branch 'mlperf-inference' into nvidia-sdxl-v4.1
arjunsuresh Sep 16, 2024
31635dd
fixed rclone cmd bug
anandhu-eng Sep 17, 2024
29054cb
Included abstract class for updating env variables
anandhu-eng Sep 17, 2024
8503492
Merge branch 'mlperf-inference' into nvidia-sdxl-v4.1
anandhu-eng Sep 17, 2024
fd7aff8
modified updation of env variable through abstract function
anandhu-eng Sep 17, 2024
4f6ac44
avoided unnecessary env updation
anandhu-eng Sep 17, 2024
57c5f24
Fixed indentations
anandhu-eng Sep 17, 2024
238f0ca
code clean
anandhu-eng Sep 18, 2024
d4505af
test commit-fix indent
anandhu-eng Sep 18, 2024
0f377b3
Revert "test commit-fix indent"
anandhu-eng Sep 18, 2024
ecdfd26
Merge branch 'mlperf-inference' into nvidia-sdxl-v4.1
anandhu-eng Sep 18, 2024
e18f64c
added error flag if no files are transferred
anandhu-eng Sep 18, 2024
7a08bfe
remove file only if PRE_DOWNLOAD_CLEAN env is set
anandhu-eng Sep 18, 2024
444121d
Merge branch 'mlperf-inference' into vllm_branch
anandhu-eng Sep 18, 2024
19fe323
Merge pull request #113 from anandhu-eng/nvidia-sdxl-v4.1
arjunsuresh Sep 18, 2024
ab28fa4
Merge pull request #114 from anandhu-eng/vllm_branch
arjunsuresh Sep 18, 2024
45b6981
Fix typo in Nvidia mlperf inference app
arjunsuresh Sep 18, 2024
18a55b8
Added get-rclone-config script for mlc-inference #172
arjunsuresh Sep 18, 2024
96a5b2d
Uses get-rclone-config script for MLC-inference configs
arjunsuresh Sep 18, 2024
db1da06
Merge branch 'mlperf-inference' into mlperf-inference
arjunsuresh Sep 18, 2024
f724f59
Uses get-rclone-config script for MLC-inference configs
arjunsuresh Sep 18, 2024
6c85bc2
Reduced the test_query_count for ABTF gh action
arjunsuresh Sep 18, 2024
3ac6245
Merge pull request #255 from GATEOverflow/mlperf-inference
arjunsuresh Sep 18, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/test-mlperf-inference-abtf-poc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
cm pull repo mlcommons@cm4abtf --branch=poc
- name: Test MLPerf Inference ABTF POC using ${{ matrix.backend }} on docker
run: |
cm run script --tags=run-abtf,inference,_poc-demo --test_query_count=5 --adr.compiler.tags=gcc --adr.cocoeval.version_max=1.5.7 --adr.cocoeval.version_max_usable=1.5.7 --quiet -v
cm run script --tags=run-abtf,inference,_poc-demo --test_query_count=2 --adr.compiler.tags=gcc --adr.cocoeval.version_max=1.5.7 --adr.cocoeval.version_max_usable=1.5.7 --quiet -v

build2:
runs-on: ${{ matrix.os }}
Expand All @@ -62,7 +62,7 @@ jobs:
cm pull repo mlcommons@cm4abtf --branch=poc
- name: Test MLPerf Inference ABTF POC using ${{ matrix.backend }} on ${{ matrix.os }}
run: |
cm run script --tags=run-abtf,inference,_poc-demo --adr.compiler.tags=gcc --adr.cocoeval.version_max=1.5.7 --adr.cocoeval.version_max_usable=1.5.7 --quiet -v
cm run script --tags=run-abtf,inference,_poc-demo --test_query_count=2 --adr.compiler.tags=gcc --adr.cocoeval.version_max=1.5.7 --adr.cocoeval.version_max_usable=1.5.7 --quiet -v

build3:
runs-on: ${{ matrix.os }}
Expand All @@ -89,4 +89,4 @@ jobs:
cm pull repo mlcommons@cm4abtf --branch=poc
- name: Test MLPerf Inference ABTF POC using ${{ matrix.backend }} on ${{ matrix.os }}
run: |
cm run script --tags=run-abtf,inference,_poc-demo --quiet --env.CM_MLPERF_LOADGEN_BUILD_FROM_SRC=off --adr.cocoeval.version_max=1.5.7 --adr.cocoeval.version_max_usable=1.5.7 -v
cm run script --tags=run-abtf,inference,_poc-demo --test_query_count=2 --quiet --env.CM_MLPERF_LOADGEN_BUILD_FROM_SRC=off --adr.cocoeval.version_max=1.5.7 --adr.cocoeval.version_max_usable=1.5.7 -v
53 changes: 39 additions & 14 deletions automation/script/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,15 +351,16 @@ def _run(self, i):

debug_uid = i.get('debug_uid', '')
if debug_uid!='':
env['CM_TMP_DEBUG_UID'] = debug_uid
r = _update_env(env, 'CM_TMP_DEBUG_UID', debug_uid)
if r['return']>0: return r

fake_deps = i.get('fake_deps', False)
if fake_deps: env['CM_TMP_FAKE_DEPS']='yes'

if str(i.get('skip_sys_utils', '')).lower() in ['true', 'yes']:
env['CM_SKIP_SYS_UTILS']='yes'
env['CM_SKIP_SYS_UTILS']='yes'
if str(i.get('skip_sudo', '')).lower() in ['true', 'yes']:
env['CM_TMP_SKIP_SUDO']='yes'
env['CM_TMP_SKIP_SUDO']='yes'

run_state = i.get('run_state', self.run_state)
if not run_state.get('version_info', []):
Expand Down Expand Up @@ -387,9 +388,9 @@ def _run(self, i):
elif 'v' in i: verbose=i['v']

if verbose:
env['CM_VERBOSE']='yes'
run_state['tmp_verbose']=True
logging.getLogger().setLevel(logging.DEBUG)
env['CM_VERBOSE']='yes'
run_state['tmp_verbose']=True
logging.getLogger().setLevel(logging.DEBUG)


print_deps = i.get('print_deps', False)
Expand Down Expand Up @@ -418,7 +419,8 @@ def _run(self, i):

# Detect current path and record in env for further use in native scripts
current_path = os.path.abspath(os.getcwd())
env['CM_TMP_CURRENT_PATH'] = current_path
r = _update_env(env, 'CM_TMP_CURRENT_PATH', current_path)
if r['return']>0: return r

# Check if quiet mode
quiet = i.get('quiet', False) if 'quiet' in i else (env.get('CM_QUIET','').lower() == 'yes')
Expand Down Expand Up @@ -472,6 +474,9 @@ def _run(self, i):
if value != '':
env['CM_' + key.upper()] = value

r = update_env_with_values(env)
if r['return']>0: return r


############################################################################################################
# Check if we want to skip cache (either by skip_cache or by fake_run)
Expand Down Expand Up @@ -1317,7 +1322,8 @@ def _run(self, i):

logging.debug(recursion_spaces+' - Version is not specified - use either default_version from meta or min/max/usable: {}'.format(version))

env['CM_VERSION'] = version
r = _update_env(env, 'CM_VERSION', version)
if r['return']>0: return r

if 'version-'+version not in cached_tags: cached_tags.append('version-'+version)

Expand All @@ -1329,8 +1335,9 @@ def _run(self, i):
if "add_deps_recursive" in versions_meta:
self._merge_dicts_with_tags(add_deps_recursive, versions_meta['add_deps_recursive'])

env['CM_TMP_CURRENT_SCRIPT_PATH'] = path

r = _update_env(env, 'CM_TMP_CURRENT_SCRIPT_PATH', path)
if r['return']>0: return r

# Run chain of docker dependencies if current run cmd is from inside a docker container
docker_deps = []
if i.get('docker_run_deps'):
Expand Down Expand Up @@ -1444,7 +1451,8 @@ def _run(self, i):
elif pip_version_max != '':
pip_version_string = '<='+pip_version_max

env['CM_TMP_PIP_VERSION_STRING'] = pip_version_string
r = _update_env(env, 'CM_TMP_PIP_VERSION_STRING', pip_version_string)
if r['return']>0: return r
if pip_version_string != '':
logging.debug(recursion_spaces+' # potential PIP version string (if needed): '+pip_version_string)

Expand Down Expand Up @@ -4359,6 +4367,20 @@ def any_enable_or_skip_script(meta, env):

return False

############################################################################################################
def _update_env(env, key=None, value=None):
if key == None or value == None:
return {'return': 1, 'error': 'None value not expected in key and value arguments in _update_env.'}
if not isinstance(key, str):
return {'return': 1, 'error': 'String value expected inside key argument.'}

env[key] = value

r = update_env_with_values(env)
if r['return']>0: return r

return {'return': 0}

############################################################################################################
def update_env_with_values(env, fail_on_not_found=False, extra_env={}):
"""
Expand Down Expand Up @@ -4532,9 +4554,12 @@ def prepare_and_run_script_with_postprocessing(i, postprocess="postprocess"):
path = '"' + path + '"'

cur_dir = os.getcwd()

env['CM_TMP_CURRENT_SCRIPT_PATH'] = path
env['CM_TMP_CURRENT_SCRIPT_WORK_PATH'] = cur_dir

r = _update_env(env, 'CM_TMP_CURRENT_SCRIPT_PATH', path)
if r['return']>0: return r

r = _update_env(env, 'CM_TMP_CURRENT_SCRIPT_WORK_PATH', cur_dir)
if r['return']>0: return r

# Record state
if tmp_file_state != '':
Expand Down
2 changes: 1 addition & 1 deletion script/app-mlperf-inference-nvidia/_cm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -783,7 +783,7 @@ variations:
CM_MLPERF_NVIDIA_HARNESS_MAXN: yes

preprocess-data:
alias: preprocess-data
alias: preprocess_data

preprocess_data:
group: run-mode
Expand Down
13 changes: 13 additions & 0 deletions script/download-file/_cm.json
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,19 @@
"deps": [
{
"tags": "get,rclone"
},
{
"tags": "get,rclone-config",
"update_tags_from_env_with_prefix": {
"_": [
"CM_RCLONE_CONFIG_NAME"
]
},
"enable_if_env": {
"CM_RCLONE_CONFIG_NAME": [
"on"
]
}
}
],
"env": {
Expand Down
16 changes: 3 additions & 13 deletions script/download-file/customize.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def preprocess(i):
env['CM_DOWNLOAD_CMD'] += f" || ((rm -f {env['CM_DOWNLOAD_FILENAME']} || true) && gdown {extra_download_options} {url})"

elif tool == "rclone":
if env.get('CM_RCLONE_CONFIG_CMD', '') != '':
if env.get('CM_RCLONE_CONFIG_CMD', '') != '': #keeping this for backward compatibility. Ideally should be done via get,rclone-config script
env['CM_DOWNLOAD_CONFIG_CMD'] = env['CM_RCLONE_CONFIG_CMD']
rclone_copy_using = env.get('CM_RCLONE_COPY_USING', 'sync')
if rclone_copy_using == "sync":
Expand All @@ -168,19 +168,9 @@ def preprocess(i):
# have to modify the variable from url to temp_url if it is going to be used anywhere after this point
url = url.replace("%", "%%")
temp_download_file = env['CM_DOWNLOAD_FILENAME'].replace("%", "%%")
env['CM_DOWNLOAD_CMD'] = f"rclone {rclone_copy_using} {q}{url}{q} {q}{os.path.join(os.getcwd(), temp_download_file)}{q} -P"
env['CM_DOWNLOAD_CMD'] = f"rclone {rclone_copy_using} {q}{url}{q} {q}{os.path.join(os.getcwd(), temp_download_file)}{q} -P --error-on-no-transfer"
else:
env['CM_DOWNLOAD_CMD'] = f"rclone {rclone_copy_using} {q}{url}{q} {q}{os.path.join(os.getcwd(), env['CM_DOWNLOAD_FILENAME'])}{q} -P"
for i in range(1,5):
url = env.get('CM_DOWNLOAD_URL'+str(i),'')
if url == '':
break
if env["CM_HOST_OS_TYPE"] == "windows":
url = url.replace("%", "%%")
temp_download_file = env['CM_DOWNLOAD_FILENAME'].replace("%", "%%")
env['CM_DOWNLOAD_CMD'] = f" || ((rm -f {env['CM_DOWNLOAD_FILENAME']} || true) && rclone {rclone_copy_using} {q}{url}{q} {q}{os.path.join(os.getcwd(), temp_download_file)}{q} -P)"
else:
env['CM_DOWNLOAD_CMD'] = f" || ((rm -f {env['CM_DOWNLOAD_FILENAME']} || true) && rclone {rclone_copy_using} {q}{url}{q} {q}{os.path.join(os.getcwd(), env['CM_DOWNLOAD_FILENAME'])}{q} -P"
env['CM_DOWNLOAD_CMD'] = f"rclone {rclone_copy_using} {q}{url}{q} {q}{os.path.join(os.getcwd(), env['CM_DOWNLOAD_FILENAME'])}{q} -P --error-on-no-transfer"

filename = env['CM_DOWNLOAD_FILENAME']
env['CM_DOWNLOAD_DOWNLOADED_FILENAME'] = filename
Expand Down
6 changes: 4 additions & 2 deletions script/download-file/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,10 @@ fi

if [[ ${require_download} == "1" ]]; then
echo ""
echo ${CM_PRE_DOWNLOAD_CLEAN_CMD}
${CM_PRE_DOWNLOAD_CLEAN_CMD}
# Run the pre-download clean command only when CM_PRE_DOWNLOAD_CLEAN is set
# to a non-empty value. `-n` tests the variable itself; `-e` would instead
# test whether a file named after the variable's value exists.
if [ -n "${CM_PRE_DOWNLOAD_CLEAN}" ]; then
  echo "${CM_PRE_DOWNLOAD_CLEAN_CMD}"
  ${CM_PRE_DOWNLOAD_CLEAN_CMD}
fi

echo ""
echo "${CM_DOWNLOAD_CMD}"
Expand Down
2 changes: 1 addition & 1 deletion script/get-ml-model-dlrm-terabyte/_cm.json
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@
},
"pytorch,fp32,weight_sharded,rclone": {
"env": {
"CM_RCLONE_CONFIG_CMD": "rclone config create mlc-inference s3 provider=Cloudflare access_key_id=f65ba5eef400db161ea49967de89f47b secret_access_key=fbea333914c292b854f14d3fe232bad6c5407bf0ab1bebf78833c2b359bdfd2b endpoint=https://c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com",
"CM_RCLONE_CONFIG_NAME": "mlc-inference",
"CM_PACKAGE_URL": "mlc-inference:mlcommons-inference-wg-public/model_weights"
}
},
Expand Down
2 changes: 1 addition & 1 deletion script/get-ml-model-gptj/_cm.json
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@
"CM_UNZIP": "yes",
"CM_DOWNLOAD_CHECKSUM_NOT_USED": "e677e28aaf03da84584bb3073b7ee315",
"CM_PACKAGE_URL": "https://cloud.mlcommons.org/index.php/s/QAZ2oM94MkFtbQx/download",
"CM_RCLONE_CONFIG_CMD": "rclone config create mlc-inference s3 provider=Cloudflare access_key_id=f65ba5eef400db161ea49967de89f47b secret_access_key=fbea333914c292b854f14d3fe232bad6c5407bf0ab1bebf78833c2b359bdfd2b endpoint=https://c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com",
"CM_RCLONE_CONFIG_NAME": "mlc-inference",
"CM_RCLONE_URL": "mlc-inference:mlcommons-inference-wg-public/gpt-j"
},
"required_disk_space": 22700
Expand Down
2 changes: 1 addition & 1 deletion script/get-ml-model-stable-diffusion/_cm.json
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@
"rclone": {
"group": "download-tool",
"env": {
"CM_RCLONE_CONFIG_CMD": "rclone config create mlc-inference s3 provider=Cloudflare access_key_id=f65ba5eef400db161ea49967de89f47b secret_access_key=fbea333914c292b854f14d3fe232bad6c5407bf0ab1bebf78833c2b359bdfd2b endpoint=https://c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com",
"CM_RCLONE_CONFIG_NAME": "mlc-inference",
"CM_DOWNLOAD_TOOL": "rclone"
},
"adr": {
Expand Down
2 changes: 1 addition & 1 deletion script/get-preprocessed-dataset-criteo/_cm.json
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@
],
"extra_cache_tags": "criteo,preprocessed,dataset",
"env": {
"CM_RCLONE_CONFIG_CMD": "rclone config create mlc-inference s3 provider=Cloudflare access_key_id=f65ba5eef400db161ea49967de89f47b secret_access_key=fbea333914c292b854f14d3fe232bad6c5407bf0ab1bebf78833c2b359bdfd2b endpoint=https://c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com",
"CM_RCLONE_CONFIG_NAME": "mlc-inference",
"CM_RCLONE_URL": "mlc-inference:mlcommons-inference-wg-public/dlrm_preprocessed",
"CM_DOWNLOAD_FINAL_ENV_NAME": "CM_DATASET_PREPROCESSED_PATH",
"CM_EXTRACT_FINAL_ENV_NAME": "CM_DATASET_PREPROCESSED_PATH",
Expand Down
4 changes: 2 additions & 2 deletions script/get-preprocessed-dataset-openorca/_cm.json
Original file line number Diff line number Diff line change
Expand Up @@ -143,15 +143,15 @@
"mlcommons": {
"env": {
"CM_DATASET_PREPROCESSED_BY_MLC": "yes",
"CM_RCLONE_CONFIG_CMD": "rclone config create mlc-inference s3 provider=Cloudflare access_key_id=f65ba5eef400db161ea49967de89f47b secret_access_key=fbea333914c292b854f14d3fe232bad6c5407bf0ab1bebf78833c2b359bdfd2b endpoint=https://c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com",
"CM_RCLONE_URL": "mlc-inference:mlcommons-inference-wg-public/open_orca"
},
"deps": [
{
"env": {
"CM_DOWNLOAD_FINAL_ENV_NAME": "CM_OPENORCA_PREPROCESSED_ROOT",
"CM_EXTRACT_FINAL_ENV_NAME": "CM_OPENORCA_PREPROCESSED_ROOT",
"CM_EXTRACT_TO_FOLDER": "openorca-preprocessed"
"CM_EXTRACT_TO_FOLDER": "openorca-preprocessed",
"CM_RCLONE_CONFIG_NAME": "mlc-inference"
},
"tags": "download-and-extract,_rclone",
"update_tags_from_env_with_prefix": {
Expand Down
13 changes: 13 additions & 0 deletions script/get-rclone-config/_cm.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
alias: get-rclone-config
automation_alias: script
automation_uid: 5b4e0237da074764
cache: false #keeping cache off as rerunning the command is safe
can_force_cache: true
tags:
- get
- rclone-config
uid: 6c59ddbc6cd046e3
variations:
mlc-inference:
env:
CM_RCLONE_CONFIG_CMD: 'rclone config create mlc-inference s3 provider=Cloudflare access_key_id=f65ba5eef400db161ea49967de89f47b secret_access_key=fbea333914c292b854f14d3fe232bad6c5407bf0ab1bebf78833c2b359bdfd2b endpoint=https://c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com'
25 changes: 25 additions & 0 deletions script/get-rclone-config/customize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from cmind import utils
import os

def preprocess(i):
    """
    Prepare the command that the native run script will execute.

    If ``CM_RCLONE_CONFIG_CMD`` is present and non-empty in the env, it is
    copied to ``CM_RUN_CMD``; otherwise the env is left unchanged.

    Args:
        i (dict): CM script input; only ``i['env']`` is read here.

    Returns:
        dict: always ``{'return': 0}``.
    """
    env = i['env']

    rclone_config_cmd = env.get('CM_RCLONE_CONFIG_CMD', '')
    if rclone_config_cmd != '':
        env['CM_RUN_CMD'] = rclone_config_cmd

    return {'return': 0}

def postprocess(i):
    """
    Post-processing hook; this script has nothing to do after the run.

    Args:
        i (dict): CM script input (unused).

    Returns:
        dict: always ``{'return': 0}``.
    """
    return {'return': 0}
1 change: 1 addition & 0 deletions script/get-rclone-config/run.bat
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
rem native script
17 changes: 17 additions & 0 deletions script/get-rclone-config/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash

# Location of this CM script: ${CM_TMP_CURRENT_SCRIPT_PATH}
#
# To export a variable, append it to tmp-run-env.out:
#   echo "VARIABLE_NAME=VARIABLE_VALUE" >>tmp-run-env.out
#
# ${CM_PYTHON_BIN_WITH_PATH} holds the path to the python binary when
# "get,python" is added as a dependency.

# Print the command before (possibly) executing it.
echo "Running: "
echo "${CM_RUN_CMD}"
echo ""

# On a fake run the command is only printed, never executed.
if [[ "${CM_FAKE_RUN}" != "yes" ]]; then
  eval "${CM_RUN_CMD}"
  rc=$?
  # Any non-zero status from the command is reported as exit code 1.
  if [[ ${rc} -ne 0 ]]; then
    exit 1
  fi
fi
Loading