Merge pull request GATEOverflow#136 from mlcommons/mlperf-inference
MLPerf inference scripts 20240722
ctuning-admin authored Jul 22, 2024
2 parents 9335505 + ef2193e commit 6267522
Showing 16 changed files with 760 additions and 52 deletions.
4 changes: 4 additions & 0 deletions automation/script/module_misc.py
@@ -1077,6 +1077,10 @@ def doc(i):
r = utils.save_txt(output_file, s)
if r['return']>0: return r

out_docs_file = os.path.join("..", "docs", "scripts", category, alias, "index.md")
r = utils.save_txt(out_docs_file, s)
if r['return']>0: return r

return {'return':0}


11 changes: 6 additions & 5 deletions script/app-mlperf-inference-mlcommons-python/_cm.yaml
@@ -82,7 +82,7 @@ new_env_keys:
- CM_HW_NAME
- CM_ML_MODEL_*
- CM_MAX_EXAMPLES

- CM_VLLM_*
new_state_keys:
- mlperf-inference-implementation
- CM_SUT_*
@@ -403,9 +403,11 @@ deps:
CM_MODEL:
- llama2-70b-99
- llama2-70b-99.9
skip_if_env:
skip_if_any_env:
CM_MLPERF_CUSTOM_MODEL_PATH:
- "on"
- "on"
CM_MLPERF_INFERENCE_API_SERVER:
- "on"

## mixtral-8x7b
- tags: get,ml-model,mixtral
@@ -517,7 +519,7 @@ deps:
- stable-diffusion-xl

## OpenOrca for LLAMA2-70b
- tags: get,preprocessed,dataset,openorca,_validation
- tags: get,preprocessed,dataset,openorca,_validation,_mlcommons
names:
- openorca-preprocessed
enable_if_env:
@@ -849,7 +851,6 @@ variations:
CM_MLPERF_MODEL_SKIP_BATCHING: true
deps:
- tags: get,generic-python-lib,_package.pydantic
version_max: "1.10.9"
- tags: get,generic-python-lib,_tokenization
- tags: get,generic-python-lib,_six
- tags: get,generic-python-lib,_package.absl-py
17 changes: 14 additions & 3 deletions script/app-mlperf-inference-mlcommons-python/customize.py
@@ -75,7 +75,7 @@ def preprocess(i):
else:
env['CM_MLPERF_LOADGEN_EXTRA_OPTIONS'] += " --mlperf_conf "+ x + env['CM_MLPERF_CONF'] + x

if env.get('CM_NETWORK_LOADGEN', '') != "lon":
if env.get('CM_NETWORK_LOADGEN', '') != "lon" and env.get('CM_MLPERF_INFERENCE_API_SERVER','')=='':
env['MODEL_DIR'] = env.get('CM_ML_MODEL_PATH')
if not env['MODEL_DIR']:
env['MODEL_DIR'] = os.path.dirname(env.get('CM_MLPERF_CUSTOM_MODEL_PATH', env.get('CM_ML_MODEL_FILE_WITH_PATH')))
@@ -297,15 +297,26 @@ def get_run_cmd_reference(os_info, env, scenario_extra_options, mode_extra_optio
env['RUN_DIR'] = os.path.join(env['CM_MLPERF_INFERENCE_SOURCE'], "language", "llama2-70b")
backend = env['CM_MLPERF_BACKEND']
device = env['CM_MLPERF_DEVICE'] if env['CM_MLPERF_DEVICE'] != "gpu" else "cuda"

cmd = env['CM_PYTHON_BIN_WITH_PATH'] + " main.py " \
" --scenario " + env['CM_MLPERF_LOADGEN_SCENARIO'] + \
" --dataset-path " + env['CM_DATASET_PREPROCESSED_PATH'] + \
" --device " + device.replace("cuda", "cuda:0") + \
env['CM_MLPERF_LOADGEN_EXTRA_OPTIONS'] + \
scenario_extra_options + mode_extra_options + \
" --output-log-dir " + env['CM_MLPERF_OUTPUT_DIR'] + \
' --dtype ' + env['CM_MLPERF_MODEL_PRECISION'] + \
" --model-path " + env['MODEL_DIR']
' --dtype ' + env['CM_MLPERF_MODEL_PRECISION']

if env.get('CM_MLPERF_INFERENCE_API_SERVER', '') != '':
env['CM_VLLM_SERVER_MODEL_NAME'] = env.get("CM_VLLM_SERVER_MODEL_NAME") or "NousResearch/Meta-Llama-3-8B-Instruct"
#env['CM_MLPERF_INFERENCE_API_SERVER'] = "http://localhost:8000"
cmd += f" --api-server {env['CM_MLPERF_INFERENCE_API_SERVER']} --model-path {env['CM_VLLM_SERVER_MODEL_NAME']} --api-model-name {env['CM_VLLM_SERVER_MODEL_NAME']} --vllm "
else:
cmd += f" --model-path {env['MODEL_DIR']}"

if env.get('CM_MLPERF_INFERENCE_NUM_WORKERS', '') != '':
cmd += f" --num-workers {env['CM_MLPERF_INFERENCE_NUM_WORKERS']}"

cmd = cmd.replace("--count", "--total-sample-count")

elif "mixtral-8x7b" in env['CM_MODEL']:
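
For context, the hunk above lets the llama2-70b reference run target a vLLM API server when CM_MLPERF_INFERENCE_API_SERVER is set, instead of loading a local checkpoint. Below is a minimal standalone sketch of that branch (not part of the commit); all env values are hypothetical, and the model-name default mirrors the diff:

# Standalone illustration of the API-server branch above; values are hypothetical.
env = {
    'CM_PYTHON_BIN_WITH_PATH': '/usr/bin/python3',
    'CM_MLPERF_LOADGEN_SCENARIO': 'Offline',
    'CM_DATASET_PREPROCESSED_PATH': '/data/openorca-preprocessed',
    'CM_MLPERF_LOADGEN_EXTRA_OPTIONS': ' --count 10',
    'CM_MLPERF_OUTPUT_DIR': '/tmp/results',
    'CM_MLPERF_MODEL_PRECISION': 'bfloat16',
    'CM_MLPERF_INFERENCE_API_SERVER': 'http://localhost:8000',
}

cmd = (env['CM_PYTHON_BIN_WITH_PATH'] + " main.py" +
       " --scenario " + env['CM_MLPERF_LOADGEN_SCENARIO'] +
       " --dataset-path " + env['CM_DATASET_PREPROCESSED_PATH'] +
       " --device cuda:0" +
       env['CM_MLPERF_LOADGEN_EXTRA_OPTIONS'] +
       " --output-log-dir " + env['CM_MLPERF_OUTPUT_DIR'] +
       " --dtype " + env['CM_MLPERF_MODEL_PRECISION'])

if env.get('CM_MLPERF_INFERENCE_API_SERVER', '') != '':
    # Same default as in the diff when CM_VLLM_SERVER_MODEL_NAME is unset.
    model_name = env.get('CM_VLLM_SERVER_MODEL_NAME') or "NousResearch/Meta-Llama-3-8B-Instruct"
    cmd += (f" --api-server {env['CM_MLPERF_INFERENCE_API_SERVER']}"
            f" --model-path {model_name} --api-model-name {model_name} --vllm")
else:
    cmd += f" --model-path {env.get('MODEL_DIR', '/models/llama2-70b')}"

# The llama2-70b harness takes --total-sample-count, so the generic --count option is rewritten.
cmd = cmd.replace("--count", "--total-sample-count")
print(cmd)
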
21 changes: 14 additions & 7 deletions script/app-mlperf-inference-redhat/_cm.yaml
@@ -107,13 +107,14 @@ deps:

- tags: get,git,repo
names:
inference-results
inference-code
updats_tags_from_env_with_prefix:
_repo.: CM_MLPERF_INFERENCE_RESULTS_REPO
- inference-results
- inference-code
update_tags_from_env_with_prefix:
_repo.:
- CM_MLPERF_INFERENCE_RESULTS_REPO
env:
CM_GIT_CHECKOUT_PATH_ENV_NAME: CM_MLPERF_INFERENCE_IMPLEMENTATION_REPO
extra_cache_tags: inference-implementation,mlperf
extra_cache_tags: results,repo,mlperf

# Post dependencies to run this app including for power measurement
post_deps:
@@ -241,7 +242,12 @@ variations:
CM_MODEL: gptj-99.9

llama2-70b_:
{}
deps:
- tags: get,dataset,openorca,language-processing,original,_redhat
env:
CM_MLPERF_IMPLEMENTATION: redhat
env:
CM_VLLM_SERVER_MODEL_NAME: NousResearch/Meta-Llama-3-8B-Instruct # assigned just for testing purpose

llama2-70b-99:
group: model
@@ -292,10 +298,11 @@ variations:
fp32:
group: precision

r4.0_default:
r4.1-dev_default:
group: version
default: true
env:
CM_MLPERF_INFERENCE_RESULTS_REPO: https://github.com/mlcommons/inference_results_v4.0

docker:
real_run: False
29 changes: 28 additions & 1 deletion script/app-mlperf-inference-redhat/customize.py
@@ -27,7 +27,12 @@ def preprocess(i):
run_dir = r ['run_dir']
print(run_cmd)
print(run_dir)
return {'return':1, 'error': 'Run command needs to be tested'}
env['CM_MLPERF_RUN_CMD'] = run_cmd
env['CM_RUN_DIR'] = run_dir
env['CM_RUN_CMD'] = run_cmd

return {'return':0}
#return {'return':1, 'error': 'Run command needs to be tested'}

def get_run_cmd(model, i):
env = i['env']
@@ -52,6 +57,28 @@ def get_run_cmd(model, i):
run_dir = os.path.join(env['CM_MLPERF_INFERENCE_IMPLEMENTATION_REPO'], "open", submitter, "code", "gptj-99")

return {'return': 0, 'run_cmd': run_cmd, 'run_dir': run_dir}

if "llama2" in model:
scenario = env['CM_MLPERF_LOADGEN_SCENARIO']
device = env['CM_MLPERF_DEVICE']
mode = env['CM_MLPERF_LOADGEN_MODE']
outdir = env['CM_MLPERF_OUTPUT_DIR']
mlperf_conf_path = env['CM_MLPERF_CONF']
user_conf_path = env['CM_MLPERF_USER_CONF']
api_server = env.get('CM_MLPERF_INFERENCE_API_SERVER', 'localhost:8000/v1')
api_model_name = env['CM_VLLM_SERVER_MODEL_NAME']
dataset_path = env['CM_DATASET_OPENORCA_PATH']
precision = env['CM_MLPERF_MODEL_PRECISION']
if mode == "accuracy":
accuracy_string = " --accuracy "
else:
accuracy_string = ""

run_cmd = f"python3 -u 'main.py' --scenario {scenario} --model-path {api_model_name} --api-model-name {api_model_name} --api-server {api_server} --mlperf-conf {mlperf_conf_path} {accuracy_string} --vllm --user-conf {user_conf_path} --dataset-path {dataset_path} --output-log-dir {outdir} --dtype float32 --device {device} "
submitter = "RedHat-Supermicro"
run_dir = os.path.join(env['CM_MLPERF_INFERENCE_IMPLEMENTATION_REPO'], "open", submitter, "code", model)

return {'return': 0, 'run_cmd': run_cmd, 'run_dir': run_dir}
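
As an illustration only (not from the commit), substituting hypothetical values into the f-string above yields a command of roughly the following shape; the endpoint and dataset path are assumptions, and the model name matches the test default set in _cm.yaml:

# Hypothetical values, for illustration only.
scenario, device, outdir = "Offline", "cpu", "/tmp/results/llama2-70b"
mlperf_conf_path, user_conf_path = "mlperf.conf", "user.conf"
api_server = "localhost:8000/v1"        # fallback used above when CM_MLPERF_INFERENCE_API_SERVER is unset
api_model_name = "NousResearch/Meta-Llama-3-8B-Instruct"
dataset_path = "/data/openorca/openorca-preprocessed.pkl"   # assumed path
accuracy_string = ""                    # becomes " --accuracy " in accuracy mode

run_cmd = (f"python3 -u 'main.py' --scenario {scenario} --model-path {api_model_name} "
           f"--api-model-name {api_model_name} --api-server {api_server} "
           f"--mlperf-conf {mlperf_conf_path} {accuracy_string} --vllm "
           f"--user-conf {user_conf_path} --dataset-path {dataset_path} "
           f"--output-log-dir {outdir} --dtype float32 --device {device} ")
print(run_cmd)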

def postprocess(i):

12 changes: 8 additions & 4 deletions script/generate-mlperf-inference-user-conf/customize.py
@@ -256,7 +256,8 @@ def preprocess(i):
ranging_user_conf += ml_model_name + "." + scenario + ".min_duration = 300000" + "\n"

if env['CM_MLPERF_RUN_STYLE'] == "test":
max_duration_test = env.get('CM_MLPERF_MAX_DURATION_TEST', 30000)
max_duration_test_s = int(env.get('CM_MLPERF_MAX_DURATION_TEST', 30))
max_duration_test = str(max_duration_test_s * 1000) # in milliseconds
query_count = env.get('CM_TEST_QUERY_COUNT', "5")
user_conf += ml_model_name + "." + scenario + ".max_query_count = " + query_count + "\n"
user_conf += ml_model_name + "." + scenario + ".min_query_count = " + query_count + "\n"
@@ -271,7 +272,8 @@

elif env['CM_MLPERF_RUN_STYLE'] == "fast":
user_conf += ml_model_name + "." + scenario + ".sample_concatenate_permutation = 0" + "\n"
max_duration_fast = env.get('CM_MLPERF_MAX_DURATION_FAST', 120000)
max_duration_fast_s = int(env.get('CM_MLPERF_MAX_DURATION_FAST', 120))
max_duration_fast = str(max_duration_fast_s * 1000) # in milliseconds
if scenario == "Server":
user_conf += ml_model_name + "." + scenario + f".max_duration = {max_duration_fast}" + "\n"
target_qps = conf['target_qps']
@@ -280,8 +282,10 @@
env['CM_MLPERF_MAX_QUERY_COUNT'] = query_count

else:
max_duration_valid = env.get('CM_MLPERF_MAX_DURATION_VALID', 660000)
max_duration_ranging = env.get('CM_MLPERF_MAX_DURATION_RANGING', 300000)
max_duration_valid_s = int(env.get('CM_MLPERF_MAX_DURATION_VALID', 660))
max_duration_valid = str(max_duration_valid_s * 1000) # in milliseconds
max_duration_ranging_s = int(env.get('CM_MLPERF_MAX_DURATION_RANGING', 300))
max_duration_ranging = str(max_duration_ranging_s * 1000) # in milliseconds
if scenario == "MultiStream" or scenario == "SingleStream":
if env.get('CM_MLPERF_USE_MAX_DURATION', 'yes').lower() not in [ "no", "false", "0" ] and env.get('CM_MLPERF_MODEL_EQUAL_ISSUE_MODE', 'no').lower() not in [ "yes", "1", "true" ]:
user_conf += ml_model_name + "." + scenario + f".max_duration = {max_duration_valid}" + "\n"
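
The hunks above switch the CM_MLPERF_MAX_DURATION_* overrides from milliseconds to seconds while keeping the value written to user.conf in milliseconds, matching the previous hard-coded defaults (30000, 120000, 660000, 300000). A small sketch of the new behaviour with a hypothetical override:

# Illustration of the new unit handling (not part of the commit).
env = {'CM_MLPERF_MAX_DURATION_VALID': '45'}              # hypothetical override: 45 seconds
max_duration_valid_s = int(env.get('CM_MLPERF_MAX_DURATION_VALID', 660))
max_duration_valid = str(max_duration_valid_s * 1000)     # written to user.conf in ms -> "45000"
ml_model_name, scenario = "resnet50", "SingleStream"      # hypothetical model/scenario
user_conf = ml_model_name + "." + scenario + f".max_duration = {max_duration_valid}" + "\n"
print(user_conf, end="")                                  # -> resnet50.SingleStream.max_duration = 45000
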
3 changes: 2 additions & 1 deletion script/get-mlperf-inference-utils/mlperf_utils.py
@@ -220,8 +220,9 @@ def get_result_string(version, model, scenario, result_path, has_power, sub_res,
return result_string, result

def get_result_table(results):


headers = ["Model", "Scenario", "Accuracy", "QPS", "Latency (in ms)", "Power Efficiency (in samples/J)", "TEST01", "TEST05", "TEST04"]
headers = ["Model", "Scenario", "Accuracy", "Throughput", "Latency (in ms)", "Power Efficiency (in samples/J)", "TEST01", "TEST05", "TEST04"]
table = []
for model in results:
for scenario in results[model]: