From dcf869d2eff8d43db984ec9ecb08b8a5b9c1fb24 Mon Sep 17 00:00:00 2001 From: Grigori Fursin Date: Wed, 20 Mar 2024 12:39:22 +0100 Subject: [PATCH 01/12] started new dev version --- cm/cmind/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cm/cmind/__init__.py b/cm/cmind/__init__.py index a6fb1ed29a..97782adc25 100644 --- a/cm/cmind/__init__.py +++ b/cm/cmind/__init__.py @@ -1,4 +1,4 @@ -__version__ = "2.0.2" +__version__ = "2.0.2.1" from cmind.core import access from cmind.core import error From 03377f3a5bcc886d841ca4bef2761f461e6b4a7e Mon Sep 17 00:00:00 2001 From: Grigori Fursin Date: Wed, 20 Mar 2024 13:16:55 +0100 Subject: [PATCH 02/12] improved handling of broken CM repositories --- cm/cmind/repo.py | 3 ++- cm/cmind/repos.py | 60 +++++++++++++++++++++++++++++++++-------------- 2 files changed, 44 insertions(+), 19 deletions(-) diff --git a/cm/cmind/repo.py b/cm/cmind/repo.py index 9dc32ae7aa..e2230a5f1e 100644 --- a/cm/cmind/repo.py +++ b/cm/cmind/repo.py @@ -63,7 +63,8 @@ def load(self): r = utils.load_yaml_and_json(file_name_without_ext = full_path) if r['return'] >0: - r['error']='CM repository is broken ({})'.format(r['error']) + r['error'] = 'CM repository is broken ({})'.format(r['error']) + r['return'] = 16 return r self.meta = r['meta'] diff --git a/cm/cmind/repos.py b/cm/cmind/repos.py index fa064aba4f..890864f7e6 100644 --- a/cm/cmind/repos.py +++ b/cm/cmind/repos.py @@ -127,21 +127,23 @@ def load(self, init = False): repo = Repo(full_path_to_repo, self.cfg) r = repo.load() - if r['return']>0: return r + if r['return']>0 and r['return']!=16: return r + + # Load only if desc exists + if r['return']!=16: + # Set only after all initializations + self.lst.append(repo) - # Set only after all initializations - self.lst.append(repo) - - repo_uid = repo.meta['uid'] - if repo_uid!='': - self.extra_info[repo_uid]=repo + repo_uid = repo.meta['uid'] + if repo_uid!='': + self.extra_info[repo_uid]=repo - repo_alias = repo.meta['alias'] - if repo_alias!='': - self.extra_info[repo_alias]=repo + repo_alias = repo.meta['alias'] + if repo_alias!='': + self.extra_info[repo_alias]=repo - found = True - break + found = True + break # Repo path exists but repo itself doesn't exist - fail if found: @@ -151,14 +153,18 @@ def load(self, init = False): # Save with correct paths if len(checked_self_paths)!=len(self.paths): - self.paths = checked_self_paths - + import copy + + self.paths = copy.deepcopy(checked_self_paths) + + if self.path_to_internal_repo in checked_self_paths: + checked_self_paths.remove(self.path_to_internal_repo) + print ('WARNING: fixed repo list file {}'.format(full_path_to_repo_paths)) - r = utils.save_json(full_path_to_repo_paths, meta = self.paths) + r = utils.save_json(full_path_to_repo_paths, meta = checked_self_paths) if r['return']>0: return r - return {'return':0} ############################################################ @@ -279,6 +285,25 @@ def pull(self, alias, url = '', branch = '', checkout = '', console = False, des print ('Local path: '+path_to_repo) print ('') + # Check if repository already exists but corrupted + path_to_repo_desc = os.path.join(path_to_repo, self.cfg['file_meta_repo']) + r=utils.is_file_json_or_yaml(file_name = path_to_repo_desc) + if r['return']>0: return r + repo_desc_exists=r['is_file'] + + if os.path.isdir(path_to_repo) and not repo_desc_exists: + print ('') + print ('WARNING: directory {} already exists but without cmr.yaml - maybe clone or download was corrupted!'.format(path_to_repo)) + + x = input('Delete 
this repo (Y/n)? ') + if x.strip().lower() not in ['n','no']: + import shutil + + print ('') + print ('Deleting {} ...'.format(path_to_repo)) + shutil.rmtree(path_to_repo, onerror=rm_read_only) + print ('') + cur_dir = os.getcwd() clone=False @@ -286,6 +311,7 @@ def pull(self, alias, url = '', branch = '', checkout = '', console = False, des download=True if url.find('.zip')>0 else False if download: + # If CM repo already exists if os.path.isdir(path_to_repo): return {'return':1, 'error':'repository is already installed'} @@ -387,8 +413,6 @@ def pull(self, alias, url = '', branch = '', checkout = '', console = False, des return {'return':1, 'error':'git checkout for repository failed'} # Check if repo description exists - path_to_repo_desc = os.path.join(path_to_repo, self.cfg['file_meta_repo']) - r=utils.is_file_json_or_yaml(file_name = path_to_repo_desc) if r['return']>0: return r From 2be3bea75aa225ef53b5142d562325dceb2c283c Mon Sep 17 00:00:00 2001 From: Grigori Fursin Date: Wed, 20 Mar 2024 13:38:08 +0100 Subject: [PATCH 03/12] added "cm import repo" --- cm/cmind/core.py | 5 ++++- cm/cmind/repo/automation/repo/_cm.json | 3 +++ cm/cmind/repo/automation/repo/module.py | 8 ++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/cm/cmind/core.py b/cm/cmind/core.py index cbd2270e26..15b01d9b9b 100644 --- a/cm/cmind/core.py +++ b/cm/cmind/core.py @@ -354,6 +354,8 @@ def access(self, i, out = None): elif not utils.is_cm_uid(xuid): return {'return':1, 'error':'you must use CM UID after automation {} when using --common'.format(parsed_automation[0][0])} + automation_meta = {} + if automation != '' and not use_common_automation: # If wildcards in automation, use the common one (usually for search across different automations) # However, still need above "parse_automation" for proper search @@ -458,7 +460,8 @@ def access(self, i, out = None): if action in self.cfg['action_substitutions']: action = self.cfg['action_substitutions'][action] - + elif action in automation_meta.get('action_substitutions',{}): + action = automation_meta['action_substitutions'][action] # Check if common automation and --help if (use_common_automation or automation=='') and cm_help: diff --git a/cm/cmind/repo/automation/repo/_cm.json b/cm/cmind/repo/automation/repo/_cm.json index e5353321ee..6ba31783d8 100644 --- a/cm/cmind/repo/automation/repo/_cm.json +++ b/cm/cmind/repo/automation/repo/_cm.json @@ -1,4 +1,7 @@ { + "action_substitutions": { + "import":"ximport" + }, "alias": "repo", "automation_alias": "automation", "automation_uid": "bbeb15d8f0a944a4", diff --git a/cm/cmind/repo/automation/repo/module.py b/cm/cmind/repo/automation/repo/module.py index 907ea3a2d1..2485a418c8 100644 --- a/cm/cmind/repo/automation/repo/module.py +++ b/cm/cmind/repo/automation/repo/module.py @@ -414,6 +414,14 @@ def delete(self, i): return r + ############################################################ + def ximport(self, i): + + if i.get('path','')!='': + i['here']=True + + return self.init(i) + ############################################################ def init(self, i): """ From c6da531694d4eb3bcbcf97083138d273f1c3fea4 Mon Sep 17 00:00:00 2001 From: Grigori Fursin Date: Wed, 20 Mar 2024 13:51:54 +0100 Subject: [PATCH 04/12] added "cm checkout repo mlcommons@ck --branch=dev" --- cm/cmind/repo/automation/repo/module.py | 46 +++++++++++++++--- cm/cmind/repos.py | 63 +++++++++++++------------ 2 files changed, 73 insertions(+), 36 deletions(-) diff --git a/cm/cmind/repo/automation/repo/module.py 
b/cm/cmind/repo/automation/repo/module.py index 2485a418c8..ef856ba3dc 100644 --- a/cm/cmind/repo/automation/repo/module.py +++ b/cm/cmind/repo/automation/repo/module.py @@ -27,6 +27,7 @@ def pull(self, i): (pat) (str): Personal Access Token (if supported and url=='') (branch) (str): Git branch (checkout) (str): Git checkout + (checkout_only) (bool): only checkout existing repo (depth) (int): Git depth (desc) (str): brief repository description (1 line) (prefix) (str): extra directory to keep CM artifacts @@ -48,6 +49,8 @@ def pull(self, i): prefix = i.get('prefix','') pat = i.get('pat','') + checkout_only = i.get('checkout_only', False) + if url == '': if alias != '': url = self.cmind.cfg['repo_url_prefix'] @@ -80,7 +83,7 @@ def pull(self, i): if url == '': pull_repos = [] - + for repo in sorted(self.cmind.repos.lst, key = lambda x: x.meta.get('alias','')): meta = repo.meta @@ -89,7 +92,7 @@ def pull(self, i): # Pick it up from the path repo_path = repo.path - + pull_repos.append({'alias': os.path.basename(repo_path), 'path_to_repo': repo_path}) else: @@ -127,15 +130,23 @@ def pull(self, i): # Prepare path to repo repos = self.cmind.repos - - r = repos.pull(alias = alias, url = url, branch = branch, checkout = checkout, console = console, - desc=desc, prefix=prefix, depth=depth, path_to_repo=path_to_repo) + + r = repos.pull(alias = alias, + url = url, + branch = branch, + checkout = checkout, + console = console, + desc=desc, + prefix=prefix, + depth=depth, + path_to_repo=path_to_repo, + checkout_only=checkout_only) if r['return']>0: return r repo_meta = r['meta'] repo_metas[alias] = repo_meta - + if len(pull_repos)>0 and self.cmind.use_index: if console: print (self.cmind.cfg['line']) @@ -145,6 +156,28 @@ def pull(self, i): return {'return':0, 'meta':repo_meta, 'metas': repo_metas} + + + ############################################################ + def checkout(self, i): + """ + Checkout repository + + Args: + (branch) (str): branch name + (checkout) (str): checkout + + See "pull" action + + Returns: + See "pull" action + """ + + i['checkout_only'] = True + + return self.pull(i) + + ############################################################ def show(self, i): """ @@ -162,7 +195,6 @@ def show(self, i): return self.search(i) - ############################################################ def search(self, i): """ diff --git a/cm/cmind/repos.py b/cm/cmind/repos.py index 890864f7e6..144b0a7d96 100644 --- a/cm/cmind/repos.py +++ b/cm/cmind/repos.py @@ -252,7 +252,7 @@ def process(self, repo_path, mode='add'): return {'return':0} ############################################################ - def pull(self, alias, url = '', branch = '', checkout = '', console = False, desc = '', prefix = '', depth = None, path_to_repo = None): + def pull(self, alias, url = '', branch = '', checkout = '', console = False, desc = '', prefix = '', depth = None, path_to_repo = None, checkout_only = False): """ Clone or pull CM repository @@ -261,6 +261,7 @@ def pull(self, alias, url = '', branch = '', checkout = '', console = False, des (url) (str): Git repository URL (branch) (str): Git repository branch (checkout) (str): Git repository checkout + (checkout_only) (bool): only checkout existing repo (depth) (int): Git repository depth (console) (bool): if True, print some info to console (desc) (str): optional repository description @@ -310,47 +311,48 @@ def pull(self, alias, url = '', branch = '', checkout = '', console = False, des download=True if url.find('.zip')>0 else False - if download: - # If CM repo 
already exists - if os.path.isdir(path_to_repo): - return {'return':1, 'error':'repository is already installed'} + if not checkout_only: + if download: + # If CM repo already exists + if os.path.isdir(path_to_repo): + return {'return':1, 'error':'repository is already installed'} - os.makedirs(path_to_repo) - - os.chdir(path_to_repo) - - cmd = 'wget --no-check-certificate "'+url+'" -O '+alias + os.makedirs(path_to_repo) - else: - if os.path.isdir(path_to_repo): - # Attempt to update os.chdir(path_to_repo) - cmd = 'git pull' + cmd = 'wget --no-check-certificate "'+url+'" -O '+alias + else: - # Attempt to clone - clone = True + if os.path.isdir(path_to_repo): + # Attempt to update + os.chdir(path_to_repo) - os.chdir(self.full_path_to_repos) + cmd = 'git pull' + else: + # Attempt to clone + clone = True - cmd = 'git clone '+url+' '+alias + os.chdir(self.full_path_to_repos) - # Check if depth is set - if depth!=None and depth!='': - cmd+=' --depth '+str(depth) + cmd = 'git clone '+url+' '+alias - if console: - print (cmd) - print ('') + # Check if depth is set + if depth!=None and depth!='': + cmd+=' --depth '+str(depth) - r = os.system(cmd) + if console: + print (cmd) + print ('') - if clone and not os.path.isdir(path_to_repo): - return {'return':1, 'error':'repository was not cloned'} + r = os.system(cmd) + + if clone and not os.path.isdir(path_to_repo): + return {'return':1, 'error':'repository was not cloned'} os.chdir(path_to_repo) - if download: + if download and not checkout_only: import zipfile pack_file = os.path.join(path_to_repo, alias) @@ -396,8 +398,11 @@ def pull(self, alias, url = '', branch = '', checkout = '', console = False, des if branch != '' or checkout != '': cmd = 'git checkout' + # When checkout only, we do not need -b for branch + extra_flag = ' ' if checkout_only else ' -b ' + if branch != '': - cmd += ' -b ' + branch + cmd += extra_flag + branch if checkout!='': cmd += ' ' + checkout From 586413184e087d0e211a4a923bb35fa7d278cb2d Mon Sep 17 00:00:00 2001 From: Grigori Fursin Date: Wed, 20 Mar 2024 13:54:06 +0100 Subject: [PATCH 05/12] V2.0.3 - added support to handle broken CM repositories: https://github.com/mlcommons/ck/issues/1177 - added "cm checkout repo mlcommons@ck --branch=dev" to make it easier to switch branches - added "cm import repo" to import repository in the current directory --- cm/CHANGES.md | 5 +++++ cm/cmind/__init__.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/cm/CHANGES.md b/cm/CHANGES.md index 8202e9477c..bdec2698db 100644 --- a/cm/CHANGES.md +++ b/cm/CHANGES.md @@ -1,3 +1,8 @@ +## V2.0.3 + - added support to handle broken CM repositories: https://github.com/mlcommons/ck/issues/1177 + - added "cm checkout repo mlcommons@ck --branch=dev" to make it easier to switch branches + - added "cm import repo" to import repository in the current directory + ## V2.0.2 - added support to update all CM Git repos in one go: "cm pull repo" - added support to show extra info about CM Git repos: "cm show repo" diff --git a/cm/cmind/__init__.py b/cm/cmind/__init__.py index 97782adc25..2bedc3b205 100644 --- a/cm/cmind/__init__.py +++ b/cm/cmind/__init__.py @@ -1,4 +1,4 @@ -__version__ = "2.0.2.1" +__version__ = "2.0.3" from cmind.core import access from cmind.core import error From 10dbcb8e42213da7dfa826150acc4c10c4bc1593 Mon Sep 17 00:00:00 2001 From: Grigori Fursin Date: Wed, 20 Mar 2024 16:15:00 +0100 Subject: [PATCH 06/12] continue improving CM-MLPerf inference GUI --- .../google-tpu.json | 3 ++- .../habana-gaudi.json | 6 +++++ 
.../cfg/benchmark-list/mlperf-inference.yaml | 1 + cm-mlops/script/gui/playground_howtorun.py | 6 +++-- .../run-mlperf-inference-app/customize.py | 22 ++++++++++++++----- cmr.yaml | 2 +- docs/taskforce.md | 3 +++ 7 files changed, 34 insertions(+), 9 deletions(-) create mode 100644 cm-mlops/cfg/benchmark-hardware-compute/habana-gaudi.json diff --git a/cm-mlops/cfg/benchmark-hardware-compute/google-tpu.json b/cm-mlops/cfg/benchmark-hardware-compute/google-tpu.json index dbcf9c70e9..43c0c6cb78 100644 --- a/cm-mlops/cfg/benchmark-hardware-compute/google-tpu.json +++ b/cm-mlops/cfg/benchmark-hardware-compute/google-tpu.json @@ -1,5 +1,6 @@ { "uid": "b3be7ac9ef954f5a", "name": "Google TPU", - "tags": "tpu,google" + "tags": "tpu,google", + "mlperf_inference_device": "tpu" } diff --git a/cm-mlops/cfg/benchmark-hardware-compute/habana-gaudi.json b/cm-mlops/cfg/benchmark-hardware-compute/habana-gaudi.json new file mode 100644 index 0000000000..c6784c0c07 --- /dev/null +++ b/cm-mlops/cfg/benchmark-hardware-compute/habana-gaudi.json @@ -0,0 +1,6 @@ +{ + "uid": "a42388a2a8cd412c", + "name": "Habana Gauidi 2", + "tags": "gaudi,habana", + "mlperf_inference_device": "gaudi" +} diff --git a/cm-mlops/cfg/benchmark-list/mlperf-inference.yaml b/cm-mlops/cfg/benchmark-list/mlperf-inference.yaml index d4f937a3d1..e57764a486 100644 --- a/cm-mlops/cfg/benchmark-list/mlperf-inference.yaml +++ b/cm-mlops/cfg/benchmark-list/mlperf-inference.yaml @@ -25,3 +25,4 @@ supported_compute: - gpu,amd - accelerator,acc,qualcomm,ai,100,ai-100 - tpu,google +- gaudi,habana diff --git a/cm-mlops/script/gui/playground_howtorun.py b/cm-mlops/script/gui/playground_howtorun.py index 30ffeb6e1b..e533ac37fb 100644 --- a/cm-mlops/script/gui/playground_howtorun.py +++ b/cm-mlops/script/gui/playground_howtorun.py @@ -223,6 +223,7 @@ def page(st, params, action = ''): ############################################################################################ # Check if has customization extra = {} + skip = False script_tags = script_meta.get('tags_help','') if script_tags =='': @@ -265,11 +266,12 @@ def page(st, params, action = ''): r = func(ii) if r['return'] > 0 : return r - extra = r.get('extra',{}) + extra = r.get('extra', {}) + skip = r.get('skip', False) ############################################################################################ # Show official GUI - if script_path!='': + if script_path!='' and not skip: import script ii = {'st': st, diff --git a/cm-mlops/script/run-mlperf-inference-app/customize.py b/cm-mlops/script/run-mlperf-inference-app/customize.py index 71e525f199..a3b8b02404 100644 --- a/cm-mlops/script/run-mlperf-inference-app/customize.py +++ b/cm-mlops/script/run-mlperf-inference-app/customize.py @@ -352,17 +352,23 @@ def gui(i): inp = script_meta['input_description'] # Here we can update params - st.markdown('---') - st.markdown('**How would you like to run the MLPerf inference benchmark?**') - - v = compute_meta.get('mlperf_inference_device') if v!=None and v!='': inp['device']['force'] = v + + if v in ['tpu', 'gaudi']: + st.markdown('----') + st.markdown('**WARNING: unified CM workflow support for this hardware is pending - please [feel free to help](https://discord.gg/JjWNWXKxwT)!**') + return {'return':0, 'skip': True, 'end_html':end_html} + + st.markdown('---') + st.markdown('**How would you like to run the MLPerf inference benchmark?**') + r = misc.make_selector({'st':st, 'st_inputs':st_inputs_custom, 'params':params, 'key': 'mlperf_inference_device', 'desc':inp['device']}) device = 
r.get('value2') inp['device']['force'] = device + if device == 'cpu': inp['implementation']['choices']=['mlcommons-python', 'mlcommons-cpp', 'intel', 'ctuning-cpp-tflite'] inp['implementation']['default']='mlcommons-python' @@ -370,9 +376,13 @@ def gui(i): inp['backend']['default']='onnxruntime' elif device == 'rocm': inp['implementation']['force']='mlcommons-python' + inp['precision']['choices']=[''] + inp['precision']['force']='' inp['backend']['force']='onnxruntime' + st.markdown('*WARNING: CM-MLPerf inference workflow was not tested thoroughly for AMD GPU - please feel free to test and improve!*') elif device == 'qaic': inp['implementation']['force']='qualcomm' + inp['precision']['force']='' inp['backend']['force']='glow' @@ -433,7 +443,7 @@ def gui(i): inp['model']['choices'] = ['resnet50', 'retinanet'] st.markdown('*:red[[CM automation recipe for this implementation](https://github.com/mlcommons/ck/tree/master/cm-mlops/script/app-mlperf-inference-mlcommons-cpp)]*') elif implementation == 'mlcommons-python': - inp['precision']['default']='float32' + inp['precision']['force']='float32' if device == 'cuda': inp['backend']['choices']=['onnxruntime','pytorch','tf'] inp['backend']['default'] = 'onnxruntime' @@ -475,7 +485,9 @@ def gui(i): if backend == 'deepsparse': inp['model']['choices'] = ['resnet50', 'retinanet', 'bert-99', 'bert-99.9'] inp['model']['default'] = 'bert-99' + inp['precision']['choices'] = ['float32', 'int8'] inp['precision']['default'] = 'int8' + if 'force' in inp['precision']: del(inp['precision']['force']) diff --git a/cmr.yaml b/cmr.yaml index c5aa854127..b06dee4200 100644 --- a/cmr.yaml +++ b/cmr.yaml @@ -5,4 +5,4 @@ git: true prefix: cm-mlops -version: 2.0.1 +version: 2.0.3 diff --git a/docs/taskforce.md b/docs/taskforce.md index 54e4712212..72de0fc047 100644 --- a/docs/taskforce.md +++ b/docs/taskforce.md @@ -2,6 +2,9 @@ # MLCommons Task force on Automation and Reproducibility +***Announcement: we are peparing new tasks for Q2-Q3 2024 with MLCommons and looking for volunteers and another co-chair. 
+ [Get in touch](mailto:gfursin@cknowledge.org) for more details!*** + ## Mission * Develop [reusable automation recipes and workflows](https://access.cknowledge.org/playground/?action=scripts) From 8b7dc9af39f15c0897a2fb42712ec266ef1e5c6f Mon Sep 17 00:00:00 2001 From: Grigori Fursin Date: Wed, 20 Mar 2024 17:55:29 +0100 Subject: [PATCH 07/12] improved CM-MLPerf GUI for Intel implementation --- cm-mlops/script/gui/script.py | 3 ++ .../run-mlperf-inference-app/customize.py | 33 +++++++++++++++---- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/cm-mlops/script/gui/script.py b/cm-mlops/script/gui/script.py index 24b00f9bc2..362f6b9bc1 100644 --- a/cm-mlops/script/gui/script.py +++ b/cm-mlops/script/gui/script.py @@ -349,6 +349,9 @@ def page(i): if len(meta.get('docker',{}))>0: run_via_docker = st.toggle('Use Docker', key='run_via_docker', value=False) + if run_via_docker: + st.markdown("*WARNING: CM automatically generates containers for a give script - it's a beta functionality - feel free to [test and provide feedback](https://discord.gg/JjWNWXKxwT)!*") + action = 'docker' if run_via_docker else 'run' cli = 'cm {} script {} {}\n'.format(action, tags, flags) diff --git a/cm-mlops/script/run-mlperf-inference-app/customize.py b/cm-mlops/script/run-mlperf-inference-app/customize.py index a3b8b02404..eccf7e62ce 100644 --- a/cm-mlops/script/run-mlperf-inference-app/customize.py +++ b/cm-mlops/script/run-mlperf-inference-app/customize.py @@ -338,6 +338,7 @@ def gui(i): script_tags = i.get('script_tags', '') compute_meta = i.get('compute_meta',{}) + compute_tags = compute_meta.get('tags', []) bench_meta = i.get('bench_meta',{}) compute_uid = compute_meta.get('uid','') @@ -371,9 +372,12 @@ def gui(i): if device == 'cpu': inp['implementation']['choices']=['mlcommons-python', 'mlcommons-cpp', 'intel', 'ctuning-cpp-tflite'] - inp['implementation']['default']='mlcommons-python' - inp['backend']['choices']=['onnxruntime','deepsparse','pytorch','tf','tvm-onnx'] - inp['backend']['default']='onnxruntime' + if 'intel' in compute_tags: + inp['implementation']['default']='intel' + else: + inp['implementation']['default']='mlcommons-python' + inp['backend']['choices']=['onnxruntime','deepsparse','pytorch','tf','tvm-onnx'] + inp['backend']['default']='onnxruntime' elif device == 'rocm': inp['implementation']['force']='mlcommons-python' inp['precision']['choices']=[''] @@ -452,14 +456,14 @@ def gui(i): inp['precision']['force']='float32' inp['model']['force']='resnet50' st.markdown('*:red[[CM automation recipe for this implementation](https://github.com/mlcommons/ck/tree/master/cm-mlops/script/app-mlperf-inference-ctuning-cpp-tflite)]*') - elif implementation == 'nvidia': inp['backend']['force'] = 'tensorrt' st.markdown('*:red[[CM automation recipe for this implementation](https://github.com/mlcommons/ck/tree/master/cm-mlops/script/app-mlperf-inference-nvidia)]*') elif implementation == 'intel': - inp['model']['choices'] = ['bert-99', 'bert-99.9', 'gptj-99'] + inp['model']['choices'] = ['bert-99', 'gptj-99'] inp['model']['default'] = 'bert-99' - inp['precision']['force'] = 'uint8' + inp['precision']['choices'] = ['int8', 'int4'] + inp['precision']['default'] = 'int8' inp['category']['force'] = 'datacenter' inp['backend']['force'] = 'pytorch' # st.markdown('*:red[Note: Intel implementation require extra CM command to build and run Docker container - you will run CM commands to run MLPerf benchmarks there!]*') @@ -491,6 +495,8 @@ def gui(i): + 
############################################################################# + # Model r = misc.make_selector({'st':st, 'st_inputs':st_inputs_custom, 'params':params, 'key': 'mlperf_inference_model', 'desc':inp['model']}) model = r.get('value2') inp['model']['force'] = model @@ -526,7 +532,20 @@ def gui(i): if github_doc_model == '': github_doc_model = model - extra_notes_online = '[Extra notes online](https://github.com/mlcommons/ck/tree/master/docs/mlperf/inference/{})\n'.format(github_doc_model) + model_cm_url='https://github.com/mlcommons/ck/tree/master/docs/mlperf/inference/{}'.format(github_doc_model) + extra_notes_online = '[Extra notes online]({})\n'.format(model_cm_url) + + st.markdown('*[CM GitHub docs for this model]({})*'.format(model_cm_url)) + + ############################################################################# + # Precision + if implementation == 'intel': + if model == 'bert-99': + inp['precision']['force'] = 'int8' + elif model == 'gptj-99': + inp['precision']['force'] = 'int4' + + r = misc.make_selector({'st':st, 'st_inputs':st_inputs_custom, 'params':params, 'key': 'mlperf_inference_precision', 'desc':inp['precision']}) precision = r.get('value2') From 5148f38faee562f2c4b8aa387ad5477967c85ff3 Mon Sep 17 00:00:00 2001 From: Grigori Fursin Date: Thu, 21 Mar 2024 12:01:00 +0100 Subject: [PATCH 08/12] continue improving CM-MLPerf GUI and fixing minor CM issues --- cm-mlops/automation/script/module.py | 6 +- .../benchmark-hardware-compute/amd-gpu.json | 2 +- .../google-tpu.json | 2 +- .../habana-gaudi.json | 2 +- .../nvidia-gpu-jetson-orin.yaml | 2 +- .../nvidia-gpu.json | 2 +- cm-mlops/script/get-git-repo/run.bat | 10 ++- cm-mlops/script/gui/script.py | 10 +-- .../script/run-mlperf-inference-app/_cm.yaml | 7 +- .../run-mlperf-inference-app/customize.py | 82 +++++++++++++++---- .../run-mlperf-inference-app/setup/i-intel.md | 2 +- .../setup/i-nvidia.md | 5 -- .../setup/i-qualcomm.md | 7 +- .../inference/all/README_nvidia_4090.md | 16 ++++ 14 files changed, 113 insertions(+), 42 deletions(-) create mode 100644 docs/mlperf/inference/all/README_nvidia_4090.md diff --git a/cm-mlops/automation/script/module.py b/cm-mlops/automation/script/module.py index 2cbd9b36f3..49ef12187c 100644 --- a/cm-mlops/automation/script/module.py +++ b/cm-mlops/automation/script/module.py @@ -1593,8 +1593,10 @@ def _run(self, i): # Check if the cached entry is dependent on any other cached entry - if dependent_cached_path != '' and not os.path.samefile(cached_path, dependent_cached_path): - cached_meta['dependent_cached_path'] = dependent_cached_path + if dependent_cached_path != '': + if os.path.isdir(cached_path) and os.path.isdir(dependent_cached_path): + if not os.path.samefile(cached_path, dependent_cached_path): + cached_meta['dependent_cached_path'] = dependent_cached_path ii = {'action': 'update', 'automation': self.meta['deps']['cache'], diff --git a/cm-mlops/cfg/benchmark-hardware-compute/amd-gpu.json b/cm-mlops/cfg/benchmark-hardware-compute/amd-gpu.json index 0b740394ce..d70e1d1554 100644 --- a/cm-mlops/cfg/benchmark-hardware-compute/amd-gpu.json +++ b/cm-mlops/cfg/benchmark-hardware-compute/amd-gpu.json @@ -1,6 +1,6 @@ { "uid": "d8f06040f7294319", - "name": "AMD GPU", + "name": "AMD - GPU", "tags": "gpu,amd", "mlperf_inference_device": "rocm" } diff --git a/cm-mlops/cfg/benchmark-hardware-compute/google-tpu.json b/cm-mlops/cfg/benchmark-hardware-compute/google-tpu.json index 43c0c6cb78..2bb4d22cf5 100644 --- a/cm-mlops/cfg/benchmark-hardware-compute/google-tpu.json +++ 
b/cm-mlops/cfg/benchmark-hardware-compute/google-tpu.json @@ -1,6 +1,6 @@ { "uid": "b3be7ac9ef954f5a", - "name": "Google TPU", + "name": "Google - TPU", "tags": "tpu,google", "mlperf_inference_device": "tpu" } diff --git a/cm-mlops/cfg/benchmark-hardware-compute/habana-gaudi.json b/cm-mlops/cfg/benchmark-hardware-compute/habana-gaudi.json index c6784c0c07..b6caa96554 100644 --- a/cm-mlops/cfg/benchmark-hardware-compute/habana-gaudi.json +++ b/cm-mlops/cfg/benchmark-hardware-compute/habana-gaudi.json @@ -1,6 +1,6 @@ { "uid": "a42388a2a8cd412c", - "name": "Habana Gauidi 2", + "name": "Intel/Habana - Gauidi 2", "tags": "gaudi,habana", "mlperf_inference_device": "gaudi" } diff --git a/cm-mlops/cfg/benchmark-hardware-compute/nvidia-gpu-jetson-orin.yaml b/cm-mlops/cfg/benchmark-hardware-compute/nvidia-gpu-jetson-orin.yaml index c2f8f534a3..d8b9787c65 100644 --- a/cm-mlops/cfg/benchmark-hardware-compute/nvidia-gpu-jetson-orin.yaml +++ b/cm-mlops/cfg/benchmark-hardware-compute/nvidia-gpu-jetson-orin.yaml @@ -2,6 +2,6 @@ uid: fe379ecd1e054a00 tags: gpu,nvidia,jetson,orin -name: "Nvidia GPU - Jetson Orin" +name: "Nvidia - GPU - Jetson Orin" mlperf_inference_device: cuda diff --git a/cm-mlops/cfg/benchmark-hardware-compute/nvidia-gpu.json b/cm-mlops/cfg/benchmark-hardware-compute/nvidia-gpu.json index 07af7aa8c4..5bc7582532 100644 --- a/cm-mlops/cfg/benchmark-hardware-compute/nvidia-gpu.json +++ b/cm-mlops/cfg/benchmark-hardware-compute/nvidia-gpu.json @@ -1,6 +1,6 @@ { "uid": "fe379ecd1e054a00", - "name": "Nvidia GPU", + "name": "Nvidia - GPU", "tags": "gpu,nvidia", "mlperf_inference_device": "cuda" } diff --git a/cm-mlops/script/get-git-repo/run.bat b/cm-mlops/script/get-git-repo/run.bat index 0fdc91dd3f..5288ab3861 100644 --- a/cm-mlops/script/get-git-repo/run.bat +++ b/cm-mlops/script/get-git-repo/run.bat @@ -8,6 +8,8 @@ rem cd inference rem git checkout -b "%CM_GIT_CHECKOUT%" rem +rem Next line allows ERRORLEVEL inside if statements! +setlocal enabledelayedexpansion set CUR_DIR=%cd% set SCRIPT_DIR=%CM_TMP_CURRENT_SCRIPT_PATH% @@ -23,13 +25,13 @@ if not exist %CM_TMP_GIT_PATH% ( echo Cloning %CM_GIT_REPO_NAME% from %CM_GIT_URL% echo "%CM_GIT_CLONE_CMD%" %CM_GIT_CLONE_CMD% - IF %ERRORLEVEL% NEQ 0 EXIT %ERRORLEVEL% + IF !ERRORLEVEL! NEQ 0 EXIT !ERRORLEVEL! cd %folder% if not "%CM_GIT_SHA%" == "" ( echo. echo. git checkout "%CM_GIT_CHECKOUT%" - IF %ERRORLEVEL% NEQ 0 EXIT %ERRORLEVEL% + IF !ERRORLEVEL! NEQ 0 EXIT !ERRORLEVEL! ) ) else ( @@ -43,7 +45,7 @@ if not "%CM_GIT_SUBMODULES%" == "" ( echo. echo Initializing submodule %%s git submodule update --init %%s - IF %ERRORLEVEL% NEQ 0 EXIT %ERRORLEVEL% + IF !ERRORLEVEL! NEQ 0 EXIT !ERRORLEVEL! ) ) @@ -51,7 +53,7 @@ if "%CM_GIT_PATCH%" == "yes" ( for %%x in (%CM_GIT_PATCH_FILEPATHS%) do ( echo Applying patch %%x ... git apply %%x - IF %ERRORLEVEL% NEQ 0 EXIT %ERRORLEVEL% + IF !ERRORLEVEL! NEQ 0 EXIT !ERRORLEVEL! 
) ) diff --git a/cm-mlops/script/gui/script.py b/cm-mlops/script/gui/script.py index 362f6b9bc1..9a8bc0cfeb 100644 --- a/cm-mlops/script/gui/script.py +++ b/cm-mlops/script/gui/script.py @@ -346,7 +346,7 @@ def page(i): ############################################################################ run_via_docker = False - if len(meta.get('docker',{}))>0: + if not extra.get('skip_script_docker_func', False) and len(meta.get('docker',{}))>0: run_via_docker = st.toggle('Use Docker', key='run_via_docker', value=False) if run_via_docker: @@ -367,14 +367,14 @@ def page(i): ############################################################################ - extra_setup = extra.get('extra_setup','') - if extra_setup!='': + extra_setup = extra.get('extra_setup','').strip() + if len(extra_setup)>2: show_extra_setup_notes = st.toggle('Show extra setup notes?', value = True) if show_extra_setup_notes: - st.markdown('---') +# st.markdown('---') st.markdown(extra_setup) - st.markdown('---') +# st.markdown('---') show_python_api = st.toggle('Run via Python API', value=False) diff --git a/cm-mlops/script/run-mlperf-inference-app/_cm.yaml b/cm-mlops/script/run-mlperf-inference-app/_cm.yaml index 09ba3dc78a..df538c9db4 100644 --- a/cm-mlops/script/run-mlperf-inference-app/_cm.yaml +++ b/cm-mlops/script/run-mlperf-inference-app/_cm.yaml @@ -26,7 +26,7 @@ tags: - inference - reference -tags_help: "run-mlperf-inference" +tags_help: "run-mlperf,inference" default_env: CM_MLPERF_IMPLEMENTATION: reference @@ -247,7 +247,6 @@ variations: group: benchmark-version r4.0: - default: true env: CM_MLPERF_INFERENCE_VERSION: '4.0' CM_RUN_MLPERF_INFERENCE_APP_DEFAULTS: r4.0_default @@ -394,6 +393,10 @@ input_description: default: test desc: MLPerf execution mode sort: 700 + sut: + default: '' + desc: SUT configuration (if known) + sort: 750 submitter: default: CTuning desc: Submitter name (without space) diff --git a/cm-mlops/script/run-mlperf-inference-app/customize.py b/cm-mlops/script/run-mlperf-inference-app/customize.py index eccf7e62ce..290a079ee2 100644 --- a/cm-mlops/script/run-mlperf-inference-app/customize.py +++ b/cm-mlops/script/run-mlperf-inference-app/customize.py @@ -350,6 +350,9 @@ def gui(i): end_html = '' + extra = {} + add_to_st_inputs = {} + inp = script_meta['input_description'] # Here we can update params @@ -369,6 +372,7 @@ def gui(i): device = r.get('value2') inp['device']['force'] = device + if device == 'cpu': inp['implementation']['choices']=['mlcommons-python', 'mlcommons-cpp', 'intel', 'ctuning-cpp-tflite'] @@ -380,7 +384,6 @@ def gui(i): inp['backend']['default']='onnxruntime' elif device == 'rocm': inp['implementation']['force']='mlcommons-python' - inp['precision']['choices']=[''] inp['precision']['force']='' inp['backend']['force']='onnxruntime' st.markdown('*WARNING: CM-MLPerf inference workflow was not tested thoroughly for AMD GPU - please feel free to test and improve!*') @@ -440,6 +443,8 @@ def gui(i): r = get_url(script_url, script_path, 'faq', implementation, 'FAQ online') if r['return'] == 0: url_faq_implementation = r['url_online'] + can_have_docker_flag = False + if implementation == 'mlcommons-cpp': # inp['backend']['choices'] = ['onnxruntime'] inp['precision']['force']='float32' @@ -458,6 +463,8 @@ def gui(i): st.markdown('*:red[[CM automation recipe for this implementation](https://github.com/mlcommons/ck/tree/master/cm-mlops/script/app-mlperf-inference-ctuning-cpp-tflite)]*') elif implementation == 'nvidia': inp['backend']['force'] = 'tensorrt' + extra['skip_script_docker_func'] = 
True + can_have_docker_flag = True st.markdown('*:red[[CM automation recipe for this implementation](https://github.com/mlcommons/ck/tree/master/cm-mlops/script/app-mlperf-inference-nvidia)]*') elif implementation == 'intel': inp['model']['choices'] = ['bert-99', 'gptj-99'] @@ -466,12 +473,16 @@ def gui(i): inp['precision']['default'] = 'int8' inp['category']['force'] = 'datacenter' inp['backend']['force'] = 'pytorch' + inp['sut']['default'] = 'sapphire-rapids.112c' + can_have_docker_flag = True + extra['skip_script_docker_func'] = True # st.markdown('*:red[Note: Intel implementation require extra CM command to build and run Docker container - you will run CM commands to run MLPerf benchmarks there!]*') st.markdown('*:red[[CM automation recipe for this implementation](https://github.com/mlcommons/ck/tree/master/cm-mlops/script/reproduce-mlperf-inference-intel)]*') elif implementation == 'qualcomm': - inp['model']['choices'] = ['resnet50', 'retinanet', 'bert-99', 'bert-99.9'] - inp['model']['default'] = 'bert-99.9' + inp['model']['choices'] = ['resnet50', 'retinanet', 'bert-99'] + inp['model']['default'] = 'bert-99' inp['precision']['default'] = 'float16' + extra['skip_script_docker_func'] = True st.markdown('*:red[[CM automation recipe for this implementation](https://github.com/mlcommons/ck/tree/master/cm-mlops/script/reproduce-mlperf-inference-qualcomm)]*') @@ -535,7 +546,7 @@ def gui(i): model_cm_url='https://github.com/mlcommons/ck/tree/master/docs/mlperf/inference/{}'.format(github_doc_model) extra_notes_online = '[Extra notes online]({})\n'.format(model_cm_url) - st.markdown('*[CM GitHub docs for this model]({})*'.format(model_cm_url)) + st.markdown('*[CM-MLPerf GitHub docs for this model]({})*'.format(model_cm_url)) ############################################################################# # Precision @@ -544,16 +555,50 @@ def gui(i): inp['precision']['force'] = 'int8' elif model == 'gptj-99': inp['precision']['force'] = 'int4' + elif implementation == 'qualcomm': + if model == 'resnet50': + inp['precision']['print'] = 'int8' + elif model == 'retinanet': + inp['precision']['print'] = 'int8' + elif model == 'bert-99': + inp['precision']['print'] = 'int8/float16' + + if inp['precision'].get('force','')=='': + x = inp['precision'].get('print','') + if x!='': + st.markdown('**{}**: {}'.format(inp['precision']['desc'], x)) + else: + r = misc.make_selector({'st':st, 'st_inputs':st_inputs_custom, 'params':params, 'key': 'mlperf_inference_precision', 'desc':inp['precision']}) + precision = r.get('value2') + inp['precision']['force'] = precision - + ############################################################################# + # Benchmark version - r = misc.make_selector({'st':st, 'st_inputs':st_inputs_custom, 'params':params, 'key': 'mlperf_inference_precision', 'desc':inp['precision']}) - precision = r.get('value2') - inp['precision']['force'] = precision + script_meta_variations = script_meta['variations'] + + choices = [''] + [k for k in script_meta_variations if script_meta_variations[k].get('group','') == 'benchmark-version'] + desc = {'choices': choices, 'default':choices[0], 'desc':'Force specific benchmark version?'} + r = misc.make_selector({'st':st, 'st_inputs':st_inputs_custom, 'params':params, 'key': 'mlperf_inference_version', 'desc':desc}) + benchmark_version = r.get('value2') + if benchmark_version!='': + params['~~benchmark-version']=[benchmark_version] + ############################################################################# + # Run via Docker container + if 
can_have_docker_flag: - + default_choice = 'yes - run in container' + + choices = [default_choice, 'no - run natively'] + desc = {'choices': choices, 'default':choices[0], 'desc':'Should CM script prepare and run Docker container in interactive mode to run MLPerf? You can then copy/paste CM commands generated by this GUI to benchmark different models.'} + r = misc.make_selector({'st':st, 'st_inputs':st_inputs_custom, 'params':params, 'key': 'mlperf_inference_docker', 'desc':desc}) + benchmark_docker = r.get('value2') + + if benchmark_docker == 'yes - run in container': + add_to_st_inputs['@docker']=True + add_to_st_inputs['@docker_cache']='no' ############################################################################# # Prepare submission @@ -712,9 +757,9 @@ def gui(i): if x != '': x+='\n\n' x+=backend_setup - extra = {'extra_notes_online':extra_notes_online, - 'extra_faq_online':url_faq_implementation, - 'extra_setup':x} + extra['extra_notes_online'] = extra_notes_online + extra['extra_faq_online'] = url_faq_implementation + extra['extra_setup'] = x ############################################################################# value_reproduce = inp.get('repro',{}).get('force', False) @@ -723,17 +768,18 @@ def gui(i): explore = st.toggle('Explore/tune benchmark (batch size, threads, etc)?', value = False) if reproduce or explore: - inp['repro']['force'] = True - extra['use_experiment'] = True - extra['add_to_st_inputs'] = { + add_to_st_inputs.update({ "@repro_extra.run-mlperf-inference-app.bench_uid": bench_uid, "@repro_extra.run-mlperf-inference-app.compute_uid": compute_uid, '@results_dir':'{{CM_EXPERIMENT_PATH3}}', '@submission_dir':'{{CM_EXPERIMENT_PATH3}}' - } + }) + + inp['repro']['force'] = True + extra['use_experiment'] = True if explore: - extra['add_to_st_inputs']['@batch_size']='{{CM_EXPLORE_BATCH_SIZE{[1,2,4,8]}}}' + add_to_st_inputs['@batch_size']='{{CM_EXPLORE_BATCH_SIZE{[1,2,4,8]}}}' ############################################################################# debug = st.toggle('Debug and run MLPerf benchmark natively from command line after CM auto-generates CMD?', value=False) @@ -741,4 +787,6 @@ def gui(i): inp['debug']['force'] = True + extra['add_to_st_inputs'] = add_to_st_inputs + return {'return':0, 'end_html':end_html, 'extra':extra} diff --git a/cm-mlops/script/run-mlperf-inference-app/setup/i-intel.md b/cm-mlops/script/run-mlperf-inference-app/setup/i-intel.md index 5f282702bb..a7079b7bcb 100644 --- a/cm-mlops/script/run-mlperf-inference-app/setup/i-intel.md +++ b/cm-mlops/script/run-mlperf-inference-app/setup/i-intel.md @@ -1 +1 @@ - \ No newline at end of file +CM can run Intel's MLPerf inference benchmark implementation either natively or inside a container. diff --git a/cm-mlops/script/run-mlperf-inference-app/setup/i-nvidia.md b/cm-mlops/script/run-mlperf-inference-app/setup/i-nvidia.md index 9f2dcf1b34..d2562c64db 100644 --- a/cm-mlops/script/run-mlperf-inference-app/setup/i-nvidia.md +++ b/cm-mlops/script/run-mlperf-inference-app/setup/i-nvidia.md @@ -5,10 +5,5 @@ cm docker script --tags=build,nvidia,inference,server ``` You can then copy/paste CM commands generated by this GUI to run MLPerf benchmarks. -You can also benchmark all models in one go using this command: -```bash -cmr "benchmark any _phoenix" -``` - Container will require around 60GB of free disk space. Docker cache and running all models (without DLRM) will require ~600 GB free disk space. 
diff --git a/cm-mlops/script/run-mlperf-inference-app/setup/i-qualcomm.md b/cm-mlops/script/run-mlperf-inference-app/setup/i-qualcomm.md index 5f282702bb..c0aef51871 100644 --- a/cm-mlops/script/run-mlperf-inference-app/setup/i-qualcomm.md +++ b/cm-mlops/script/run-mlperf-inference-app/setup/i-qualcomm.md @@ -1 +1,6 @@ - \ No newline at end of file +* CM runs Qualcomm's MLPerf inference benchmark implementation natively. +* [QAIC SDK](https://github.com/quic/software-kit-for-qualcomm-cloud-ai-100-cc) must be installed. +* If you run CM-MLPerf for Qualcomm in a cloud, you may need to update/change AIM with an SDK version compatible with the Qualcomm's MLPerf implementation. + Please check [cTuning's MLPerf inference results](https://mlcommons.org/benchmarks/inference-datacenter/) to see the working QAIC SDK versions. + + diff --git a/docs/mlperf/inference/all/README_nvidia_4090.md b/docs/mlperf/inference/all/README_nvidia_4090.md new file mode 100644 index 0000000000..de6781661e --- /dev/null +++ b/docs/mlperf/inference/all/README_nvidia_4090.md @@ -0,0 +1,16 @@ +[ [Back to MLPerf inference benchmarks index](../README.md) ] + +*Note: from Feb 2024, we suggest you to use [this GUI](https://access.cknowledge.org/playground/?action=howtorun&bench_uid=39877bb63fb54725) + to configure MLPerf inference benchmark, generate CM commands to run it across different implementations, models, data sets, software + and hardware, and prepare your submissions.* + + +## MLPerf inference v3.1 + +You can benchmark all models using Nvidia MLPerf inference implementation in one go using this CM command: +```bash +cmr "benchmark any _phoenix" +``` + +*This CM command tested for MLPerf inference v3.1*. + From 7dd6f8ab8082f58f7c9d3ce6fd5ffc9803426922 Mon Sep 17 00:00:00 2001 From: Grigori Fursin Date: Thu, 21 Mar 2024 12:30:21 +0100 Subject: [PATCH 09/12] improving CM-MLPerf GUI for Nvidia --- .../script/run-mlperf-inference-app/customize.py | 5 +++++ .../run-mlperf-inference-app/setup/i-nvidia.md | 12 +++--------- docs/mlperf/setup/setup-nvidia.md | 11 +++++++++++ 3 files changed, 19 insertions(+), 9 deletions(-) create mode 100644 docs/mlperf/setup/setup-nvidia.md diff --git a/cm-mlops/script/run-mlperf-inference-app/customize.py b/cm-mlops/script/run-mlperf-inference-app/customize.py index 290a079ee2..ae15277ec4 100644 --- a/cm-mlops/script/run-mlperf-inference-app/customize.py +++ b/cm-mlops/script/run-mlperf-inference-app/customize.py @@ -365,6 +365,11 @@ def gui(i): st.markdown('**WARNING: unified CM workflow support for this hardware is pending - please [feel free to help](https://discord.gg/JjWNWXKxwT)!**') return {'return':0, 'skip': True, 'end_html':end_html} + elif 'orin' in compute_tags: + st.markdown('----') + st.markdown('**WARNING: we need to encode CM knowledge from [this Orin setp](https://github.com/mlcommons/ck/blob/master/docs/mlperf/setup/setup-nvidia-jetson-orin.md) to this GUI!**') + return {'return':0, 'skip': True, 'end_html':end_html} + st.markdown('---') st.markdown('**How would you like to run the MLPerf inference benchmark?**') diff --git a/cm-mlops/script/run-mlperf-inference-app/setup/i-nvidia.md b/cm-mlops/script/run-mlperf-inference-app/setup/i-nvidia.md index d2562c64db..bfa50410c3 100644 --- a/cm-mlops/script/run-mlperf-inference-app/setup/i-nvidia.md +++ b/cm-mlops/script/run-mlperf-inference-app/setup/i-nvidia.md @@ -1,9 +1,3 @@ -Note: Nvidia implementation require extra CM command to build and run Docker container - -```bash -cm docker script 
--tags=build,nvidia,inference,server -``` -You can then copy/paste CM commands generated by this GUI to run MLPerf benchmarks. - -Container will require around 60GB of free disk space. -Docker cache and running all models (without DLRM) will require ~600 GB free disk space. +* Container will require around 60GB of free disk space. +* Docker cache and running all models (without DLRM) will require ~600 GB free disk space. +* When you get into an interactive Docker mode, you can copy/paste CM commands generated by this GUI to benchmark different models. diff --git a/docs/mlperf/setup/setup-nvidia.md b/docs/mlperf/setup/setup-nvidia.md new file mode 100644 index 0000000000..de966e342b --- /dev/null +++ b/docs/mlperf/setup/setup-nvidia.md @@ -0,0 +1,11 @@ +[ [Back to MLPerf benchmarks index](../README.md) ] + +## MLPerf inference v3.1 + +Nvidia implementation requires extra CM command to build and run Docker container + +```bash +cm docker script --tags=build,nvidia,inference,server +``` + +You can then copy/paste CM commands generated by this GUI to run MLPerf benchmarks. From 567cd80983b2b9a03ebfd631cceda0a892e964bb Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 11 Apr 2024 15:11:27 +0530 Subject: [PATCH 10/12] Upgrade the mlc version for get-mlperf-inference custom version --- cm-mlops/script/get-mlperf-inference-src/_cm.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cm-mlops/script/get-mlperf-inference-src/_cm.json b/cm-mlops/script/get-mlperf-inference-src/_cm.json index 479dd2c4c1..3b7b4ea839 100644 --- a/cm-mlops/script/get-mlperf-inference-src/_cm.json +++ b/cm-mlops/script/get-mlperf-inference-src/_cm.json @@ -194,7 +194,7 @@ "versions": { "custom": { "env": { - "CM_MLPERF_LAST_RELEASE": "v3.1" + "CM_MLPERF_LAST_RELEASE": "v4.0" } }, "deepsparse": { From 51ed188630ec18f15152a6330b853b284849d27c Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Sat, 13 Apr 2024 00:42:32 +0530 Subject: [PATCH 11/12] Fix nvmitten install --- cm-mlops/script/get-nvidia-mitten/_cm.json | 3 +++ cm-mlops/script/get-nvidia-mitten/run.sh | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/cm-mlops/script/get-nvidia-mitten/_cm.json b/cm-mlops/script/get-nvidia-mitten/_cm.json index c184358447..94675091bd 100644 --- a/cm-mlops/script/get-nvidia-mitten/_cm.json +++ b/cm-mlops/script/get-nvidia-mitten/_cm.json @@ -16,6 +16,9 @@ ], "tags": "get,python3" }, + { + "tags": "get,generic-python-lib,_pycuda" + }, { "tags": "get,git,_repo.https://github.com/NVIDIA/mitten", "force_env_keys": [ diff --git a/cm-mlops/script/get-nvidia-mitten/run.sh b/cm-mlops/script/get-nvidia-mitten/run.sh index bbb9d5222b..28b1ea4ce1 100644 --- a/cm-mlops/script/get-nvidia-mitten/run.sh +++ b/cm-mlops/script/get-nvidia-mitten/run.sh @@ -1,3 +1,4 @@ #!/bin/bash - -echo "TBD" +cd ${CM_NVIDIA_MITTEN_SRC} +${CM_PYTHON_BIN_WITH_PATH} -m pip install . +test $? -eq 0 || exit $? 
From 22ddbf7f570dd7d1a690d7258bc911176f43112f Mon Sep 17 00:00:00 2001
From: Arjun Suresh
Date: Sat, 13 Apr 2024 01:05:23 +0530
Subject: [PATCH 12/12] Support nvmitten install from src

---
 .../script/app-mlperf-inference-nvidia/_cm.yaml | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/cm-mlops/script/app-mlperf-inference-nvidia/_cm.yaml b/cm-mlops/script/app-mlperf-inference-nvidia/_cm.yaml
index 578a820e99..19e789ae0d 100644
--- a/cm-mlops/script/app-mlperf-inference-nvidia/_cm.yaml
+++ b/cm-mlops/script/app-mlperf-inference-nvidia/_cm.yaml
@@ -262,6 +262,19 @@ deps:
       - run_harness

   - tags: get,generic-python-lib,_package.nvmitten,_path./opt/nvmitten-0.1.3-cp38-cp38-linux_x86_64.whl
+    enable_if_env:
+      CM_RUN_STATE_DOCKER:
+      - 'yes'
+      - True
+      - 'True'
+
+  - tags: get,nvidia,mitten
+    skip_if_env:
+      CM_RUN_STATE_DOCKER:
+      - 'yes'
+      - True
+      - 'True'
+
   prehook_deps:

   ########################################################################
   # Install GPTJ-6B model