Commit

Merge branch 'mlperf-inference' into mixtral+gha+selfhosted
arjunsuresh authored Oct 8, 2024
2 parents bcec9ec + 454a92b commit 72cf058
Showing 15 changed files with 206 additions and 30 deletions.
49 changes: 49 additions & 0 deletions .github/workflows/test-mlperf-inference-dlrm.yml
@@ -0,0 +1,49 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: MLPerf inference DLRM-v2

on:
schedule:
- cron: "30 21 * * *"

jobs:
build_reference:
if: github.repository_owner == 'gateoverflow'
runs-on: [ self-hosted, GO-spr, linux, x64 ]
strategy:
fail-fast: false
matrix:
python-version: [ "3.12" ]
backend: [ "pytorch" ]
device: [ "cpu", "cuda" ]

steps:
- name: Test MLPerf Inference DLRM-v2 reference implementation
run: |
source gh_action/bin/deactivate || python3 -m venv gh_action
source gh_action/bin/activate
export CM_REPOS=$HOME/GH_CM
python3 -m pip install cm4mlops
cm pull repo
cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --model=dlrm-v2-99 --implementation=reference --batch_size=1 --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --target_qps=1 --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean
build_intel:
if: github.repository_owner == 'gateoverflow'
runs-on: [ self-hosted, GO-spr, linux, x64 ]
strategy:
fail-fast: false
matrix:
python-version: [ "3.12" ]
backend: [ "pytorch" ]
device: [ "cpu" ]

steps:
- name: Test MLPerf Inference DLRM-v2 INTEL implementation
run: |
source gh_action/bin/deactivate || python3 -m venv gh_action
source gh_action/bin/activate
export CM_REPOS=$HOME/GH_CM
python3 -m pip install cm4mlops
cm pull repo
cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --model=dlrm-v2-99 --implementation=intel --batch_size=1 --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --docker --quiet --test_query_count=1 --target_qps=1 --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean
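Both jobs drive the benchmark through the cm run script CLI inside a throwaway virtual environment. For reference, a rough Python equivalent of the reference-implementation invocation via the cmind API is sketched below; the dict keys are assumed to mirror the CLI flags, and the concrete backend/device values stand in for the matrix variables, so treat it as an illustration rather than what the workflow actually executes.

```python
# Sketch only: approximates the CLI call above through the cmind Python API.
# The flag-to-key mapping is an assumption; the workflow itself uses the cm CLI.
import cmind

r = cmind.access({
    'action': 'run',
    'automation': 'script',
    'tags': 'run-mlperf,inference,_submission,_short',
    'submitter': 'MLCommons',
    'model': 'dlrm-v2-99',
    'implementation': 'reference',
    'backend': 'pytorch',        # stands in for ${{ matrix.backend }}
    'device': 'cpu',             # stands in for ${{ matrix.device }}
    'scenario': 'Offline',
    'execution_mode': 'test',
    'test_query_count': '1',
    'target_qps': '1',
    'quiet': True,
    'docker': True,
})
if r['return'] > 0:
    print(r.get('error', 'cm script failed'))
```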
59 changes: 36 additions & 23 deletions automation/script/module.py
@@ -413,10 +413,6 @@ def _run(self, i):

ignore_script_error = i.get('ignore_script_error', False)

# Get constant env and state
const = i.get('const',{})
const_state = i.get('const_state',{})

# Detect current path and record in env for further use in native scripts
current_path = os.path.abspath(os.getcwd())
r = _update_env(env, 'CM_TMP_CURRENT_PATH', current_path)
@@ -838,8 +834,8 @@ def _run(self, i):
script_artifact_env = meta.get('env',{})
env.update(script_artifact_env)



script_artifact_state = meta.get('state',{})
utils.merge_dicts({'dict1':state, 'dict2':script_artifact_state, 'append_lists':True, 'append_unique':True})



@@ -853,7 +849,7 @@


# STEP 700: Overwrite env with keys from the script input (to allow user friendly CLI)
# IT HAS THE PRIORITY OVER meta['default_env'] and meta['env']
# IT HAS THE PRIORITY OVER meta['default_env'] and meta['env'] but not over the meta from versions/variations
# (env OVERWRITE - user enforces it from CLI)
# (it becomes const)
if input_mapping:
@@ -866,7 +862,9 @@
# update_env_from_input_mapping(const, i, docker_input_mapping)



# Update env/state with const
env.update(const)
utils.merge_dicts({'dict1':state, 'dict2':const_state, 'append_lists':True, 'append_unique':True})



@@ -882,7 +880,7 @@
variations = script_artifact.meta.get('variations', {})
state['docker'] = meta.get('docker', {})

r = self._update_state_from_variations(i, meta, variation_tags, variations, env, state, deps, post_deps, prehook_deps, posthook_deps, new_env_keys_from_meta, new_state_keys_from_meta, add_deps_recursive, run_state, recursion_spaces, verbose)
r = self._update_state_from_variations(i, meta, variation_tags, variations, env, state, const, const_state, deps, post_deps, prehook_deps, posthook_deps, new_env_keys_from_meta, new_state_keys_from_meta, add_deps_recursive, run_state, recursion_spaces, verbose)
if r['return'] > 0:
return r

@@ -952,7 +950,7 @@ def _run(self, i):

if version!='' and version in versions:
versions_meta = versions[version]
r = update_state_from_meta(versions_meta, env, state, deps, post_deps, prehook_deps, posthook_deps, new_env_keys_from_meta, new_state_keys_from_meta, i)
r = update_state_from_meta(versions_meta, env, state, const, const_state, deps, post_deps, prehook_deps, posthook_deps, new_env_keys_from_meta, new_state_keys_from_meta, i)
if r['return']>0: return r
adr=get_adr(versions_meta)
if adr:
@@ -1328,7 +1326,7 @@ def _run(self, i):

if default_version in versions:
versions_meta = versions[default_version]
r = update_state_from_meta(versions_meta, env, state, deps, post_deps, prehook_deps, posthook_deps, new_env_keys_from_meta, new_state_keys_from_meta, i)
r = update_state_from_meta(versions_meta, env, state, const, const_state, deps, post_deps, prehook_deps, posthook_deps, new_env_keys_from_meta, new_state_keys_from_meta, i)
if r['return']>0: return r

if "add_deps_recursive" in versions_meta:
@@ -1374,7 +1372,6 @@ def _run(self, i):
r = update_env_with_values(env)
if r['return']>0: return r


# Clean some output files
clean_tmp_files(clean_files, recursion_spaces)

@@ -1451,8 +1448,12 @@ def _run(self, i):
elif pip_version_max != '':
pip_version_string = '<='+pip_version_max

env.update(const)
utils.merge_dicts({'dict1':state, 'dict2':const_state, 'append_lists':True, 'append_unique':True})

r = _update_env(env, 'CM_TMP_PIP_VERSION_STRING', pip_version_string)
if r['return']>0: return r

if pip_version_string != '':
logging.debug(recursion_spaces+' # potential PIP version string (if needed): '+pip_version_string)

@@ -1462,10 +1463,6 @@

logging.debug(recursion_spaces+' - Running preprocess ...')

# Update env and state with const
utils.merge_dicts({'dict1':env, 'dict2':const, 'append_lists':True, 'append_unique':True})
utils.merge_dicts({'dict1':state, 'dict2':const_state, 'append_lists':True, 'append_unique':True})

run_script_input['run_state'] = run_state

ii = copy.deepcopy(customize_common_input)
@@ -1916,7 +1913,7 @@ def _dump_version_info_for_script(self, output_dir = os.getcwd(), quiet = False,
return {'return': 0}

######################################################################################
def _update_state_from_variations(self, i, meta, variation_tags, variations, env, state, deps, post_deps, prehook_deps, posthook_deps, new_env_keys_from_meta, new_state_keys_from_meta, add_deps_recursive, run_state, recursion_spaces, verbose):
def _update_state_from_variations(self, i, meta, variation_tags, variations, env, state, const, const_state, deps, post_deps, prehook_deps, posthook_deps, new_env_keys_from_meta, new_state_keys_from_meta, add_deps_recursive, run_state, recursion_spaces, verbose):

# Save current explicit variations
import copy
@@ -2019,7 +2016,7 @@ def _update_state_from_variations(self, i, meta, variation_tags, variations, env
if variation_tag_dynamic_suffix:
self._update_variation_meta_with_dynamic_suffix(variation_meta, variation_tag_dynamic_suffix)

r = update_state_from_meta(variation_meta, env, state, deps, post_deps, prehook_deps, posthook_deps, new_env_keys_from_meta, new_state_keys_from_meta, i)
r = update_state_from_meta(variation_meta, env, state, const, const_state, deps, post_deps, prehook_deps, posthook_deps, new_env_keys_from_meta, new_state_keys_from_meta, i)
if r['return']>0: return r

if variation_meta.get('script_name', '')!='':
@@ -2050,7 +2047,7 @@ def _update_state_from_variations(self, i, meta, variation_tags, variations, env

combined_variation_meta = variations[combined_variation]

r = update_state_from_meta(combined_variation_meta, env, state, deps, post_deps, prehook_deps, posthook_deps, new_env_keys_from_meta, new_state_keys_from_meta, i)
r = update_state_from_meta(combined_variation_meta, env, state, const, const_state, deps, post_deps, prehook_deps, posthook_deps, new_env_keys_from_meta, new_state_keys_from_meta, i)
if r['return']>0: return r

adr=get_adr(combined_variation_meta)
@@ -3012,8 +3009,8 @@ def _run_deps(self, deps, clean_env_keys_deps, env, state, const, const_state, a
'remembered_selections': remembered_selections,
'env':env,
'state':state,
'const':const,
'const_state':const_state,
'const':copy.deepcopy(const),
'const_state':copy.deepcopy(const_state),
'add_deps_recursive':add_deps_recursive,
'debug_script_tags':debug_script_tags,
'verbose':verbose,
@@ -3040,6 +3037,11 @@
r = update_env_with_values(env)
if r['return']>0: return r

# Update env/state with const
env.update(const)
utils.merge_dicts({'dict1':state, 'dict2':const_state, 'append_lists':True, 'append_unique':True})


return {'return': 0}

##############################################################################
@@ -4418,7 +4420,7 @@ def update_env_with_values(env, fail_on_not_found=False, extra_env={}):

# Check cases such as --env.CM_SKIP_COMPILE
if type(value)==bool:
env[key] = str(value)
env[key] = value
continue

tmp_values = re.findall(r'<<<(.*?)>>>', str(value))
@@ -5110,20 +5112,31 @@ def update_env_from_input_mapping(env, inp, input_mapping):
env[input_mapping[key]] = inp[key]

##############################################################################
def update_state_from_meta(meta, env, state, deps, post_deps, prehook_deps, posthook_deps, new_env_keys, new_state_keys, i):
def update_state_from_meta(meta, env, state, const, const_state, deps, post_deps, prehook_deps, posthook_deps, new_env_keys, new_state_keys, i):
"""
Internal: update env and state from meta
"""

default_env = meta.get('default_env',{})
for key in default_env:
env.setdefault(key, default_env[key])

update_env = meta.get('env', {})
env.update(update_env)

update_const = meta.get('const', {})
if update_const:
const.update(update_const)
env.update(const)

update_state = meta.get('state', {})
utils.merge_dicts({'dict1':state, 'dict2':update_state, 'append_lists':True, 'append_unique':True})

update_const_state = meta.get('const_state', {})
if const_state:
utils.merge_dicts({'dict1':const_state, 'dict2':update_const_state, 'append_lists':True, 'append_unique':True})
utils.merge_dicts({'dict1':state, 'dict2':const_state, 'append_lists':True, 'append_unique':True})

new_deps = meta.get('deps', [])
if len(new_deps)>0:
append_deps(deps, new_deps)
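The common thread in the module.py changes is that const and const_state now travel alongside env and state into update_state_from_meta and _update_state_from_variations, so a value pinned under a script's (or a variation's) const meta keeps overriding whatever later env updates would set. A condensed sketch of that precedence is below; merge() is a stand-in for utils.merge_dicts, which additionally handles list appending and uniqueness.

```python
# Condensed sketch of the precedence implied above; not the real implementation.
def merge(dst, src):               # stand-in for utils.merge_dicts
    dst.update(src)

def apply_meta(meta, env, state, const, const_state):
    for k, v in meta.get('default_env', {}).items():
        env.setdefault(k, v)                   # default_env: fallback only
    env.update(meta.get('env', {}))            # env: normal overwrite
    if meta.get('const'):
        const.update(meta['const'])
        env.update(const)                      # const wins over env
    merge(state, meta.get('state', {}))
    if meta.get('const_state'):
        merge(const_state, meta['const_state'])
        merge(state, const_state)              # const_state wins over state

env, state, const, const_state = {}, {}, {}, {}
apply_meta({'env': {'CM_X': 'a'}, 'const': {'CM_X': 'pinned'}},
           env, state, const, const_state)
apply_meta({'env': {'CM_X': 'b'}},             # e.g. a variation trying to override
           env, state, const, const_state)
env.update(const)                              # re-applied in _run()/_run_deps()
print(env['CM_X'])                             # -> 'pinned'
```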
8 changes: 6 additions & 2 deletions automation/script/module_misc.py
@@ -1393,6 +1393,8 @@ def dockerfile(i):

env=i.get('env', {})
state = i.get('state', {})
const=i.get('const', {})
const_state = i.get('const_state', {})
script_automation = i['self_module']

dockerfile_env=i.get('dockerfile_env', {})
@@ -1420,7 +1422,7 @@ def dockerfile(i):
state['docker'] = docker_settings
add_deps_recursive = i.get('add_deps_recursive', {})

r = script_automation._update_state_from_variations(i, meta, variation_tags, variations, env, state, deps = [], post_deps = [], prehook_deps = [], posthook_deps = [], new_env_keys_from_meta = [], new_state_keys_from_meta = [], add_deps_recursive = add_deps_recursive, run_state = {}, recursion_spaces='', verbose = False)
r = script_automation._update_state_from_variations(i, meta, variation_tags, variations, env, state, const, const_state, deps = [], post_deps = [], prehook_deps = [], posthook_deps = [], new_env_keys_from_meta = [], new_state_keys_from_meta = [], add_deps_recursive = add_deps_recursive, run_state = {}, recursion_spaces='', verbose = False)
if r['return'] > 0:
return r

@@ -1741,6 +1743,8 @@ def docker(i):
env['CM_RUN_STATE_DOCKER'] = False
script_automation = i['self_module']
state = i.get('state', {})
const = i.get('const', {})
const_state = i.get('const_state', {})

tags_split = i.get('tags', '').split(",")
variation_tags = [ t[1:] for t in tags_split if t.startswith("_") ]
@@ -1793,7 +1797,7 @@ def docker(i):
state['docker'] = docker_settings
add_deps_recursive = i.get('add_deps_recursive', {})

r = script_automation._update_state_from_variations(i, meta, variation_tags, variations, env, state, deps = [], post_deps = [], prehook_deps = [], posthook_deps = [], new_env_keys_from_meta = [], new_state_keys_from_meta = [], add_deps_recursive = add_deps_recursive, run_state = {}, recursion_spaces='', verbose = False)
r = script_automation._update_state_from_variations(i, meta, variation_tags, variations, env, state, const, const_state, deps = [], post_deps = [], prehook_deps = [], posthook_deps = [], new_env_keys_from_meta = [], new_state_keys_from_meta = [], add_deps_recursive = add_deps_recursive, run_state = {}, recursion_spaces='', verbose = False)
if r['return'] > 0:
return r

5 changes: 5 additions & 0 deletions script/app-mlperf-inference/_cm.yaml
@@ -659,6 +659,7 @@ variations:

3d-unet_,reference:
docker:
image_name: mlperf-inference-mlcommons-python-implementation-3d-unet
deps:
- enable_if_env:
CM_MLPERF_DATASET_3DUNET_DOWNLOAD_TO_HOST:
@@ -698,6 +699,7 @@ variations:

sdxl,reference,float16:
docker:
image_name: mlperf-inference-mlcommons-python-implementation-sdxl-float16
deps:
- enable_if_env:
CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST:
@@ -706,6 +708,7 @@

sdxl,reference,bfloat16:
docker:
image_name: mlperf-inference-mlcommons-python-implementation-sdxl-bfloat16
deps:
- enable_if_env:
CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST:
@@ -714,6 +717,7 @@

sdxl,reference,float32:
docker:
image_name: mlperf-inference-mlcommons-python-implementation-sdxl-float32
deps:
- enable_if_env:
CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST:
@@ -765,6 +769,7 @@ variations:

llama2-70b_,reference:
docker:
image_name: mlperf-inference-mlcommons-python-implementation-llama2-70b
deps:
- enable_if_env:
CM_MLPERF_MODEL_LLAMA2_70B_DOWNLOAD_TO_HOST:
1 change: 1 addition & 0 deletions script/get-cuda/_cm.yaml
@@ -46,6 +46,7 @@ new_env_keys:
- CUDA_PATH
- CM_CUDA_*
- CM_NVCC_*
- CM_MLPERF_SUT_NAME_RUN_CONFIG_SUFFIX5
- +PATH
- +C_INCLUDE_PATH
- +CPLUS_INCLUDE_PATH
1 change: 1 addition & 0 deletions script/get-cuda/customize.py
@@ -214,5 +214,6 @@ def postprocess(i):
env['+ LDFLAGS'].append("-L"+x)

env['CM_CUDA_VERSION_STRING'] = "cu"+env['CM_CUDA_VERSION'].replace(".", "")
env['CM_MLPERF_SUT_NAME_RUN_CONFIG_SUFFIX5'] = env['CM_CUDA_VERSION_STRING']

return {'return':0, 'version': version}
2 changes: 1 addition & 1 deletion script/get-mlperf-inference-sut-configs/customize.py
@@ -27,7 +27,7 @@ def postprocess(i):
implementation_string = env['CM_MLPERF_SUT_NAME_IMPLEMENTATION_PREFIX'] if env.get('CM_MLPERF_SUT_NAME_IMPLEMENTATION_PREFIX', '') != '' else env.get('CM_MLPERF_IMPLEMENTATION', 'default')

run_config = []
for i in range(1,5):
for i in range(1,6):
if env.get(f'CM_MLPERF_SUT_NAME_RUN_CONFIG_SUFFIX{i}', '') != '':
run_config.append(env.get(f'CM_MLPERF_SUT_NAME_RUN_CONFIG_SUFFIX{i}'))

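Taken together, the two customize.py changes feed the CUDA version string (for example cu124, built as "cu" plus the dotted version with the dots removed) into the SUT run-config name as the fifth suffix, and the loop above now scans suffixes 1 through 5. A small sketch of that collection step follows; the example env values and the '-' join are assumptions, since the assembly after the loop is not part of this excerpt.

```python
# Sketch of the suffix collection shown above; example values and the '-' join
# are assumptions (the code after the loop is not included in the diff).
env = {
    'CM_MLPERF_SUT_NAME_IMPLEMENTATION_PREFIX': 'reference',   # example value
    'CM_MLPERF_SUT_NAME_RUN_CONFIG_SUFFIX4': 'scc24-main',     # example value
    'CM_MLPERF_SUT_NAME_RUN_CONFIG_SUFFIX5': 'cu124',          # set by get-cuda
}

run_config = []
for i in range(1, 6):              # now includes SUFFIX5
    suffix = env.get(f'CM_MLPERF_SUT_NAME_RUN_CONFIG_SUFFIX{i}', '')
    if suffix != '':
        run_config.append(suffix)

print('-'.join(run_config))        # e.g. 'scc24-main-cu124'
```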
6 changes: 5 additions & 1 deletion script/run-mlperf-inference-app/_cm.yaml
@@ -43,6 +43,7 @@ input_mapping:
category: CM_MLPERF_SUBMISSION_SYSTEM_TYPE
clean: CM_MLPERF_CLEAN_ALL
compliance: CM_MLPERF_LOADGEN_COMPLIANCE
custom_system_nvidia: CM_CUSTOM_SYSTEM_NVIDIA
dashboard_wb_project: CM_MLPERF_DASHBOARD_WANDB_PROJECT
dashboard_wb_user: CM_MLPERF_DASHBOARD_WANDB_USER
debug: CM_DEBUG_SCRIPT_BENCHMARK_PROGRAM
@@ -140,7 +141,8 @@ deps:
- tags: install,pip-package,for-cmind-python,_package.tabulate
- tags: get,mlperf,inference,utils

docker:
#We use this script as a command generator to run docker via app-mlperf-inference script
docker_off:
mounts:
- ${{ INSTALL_DATA_PATH }}:/install_data
- ${{ DATA_PATH }}:/data
@@ -248,6 +250,7 @@ variations:
- short
env:
CM_MLPERF_SUT_NAME_RUN_CONFIG_SUFFIX4: scc24-base
CM_DOCKER_IMAGE_NAME: scc24
adr:
coco2014-preprocessed:
tags: _size.50,_with-sample-ids
@@ -271,6 +274,7 @@
extra_cache_tags: "scc24-main"
env:
CM_MLPERF_SUT_NAME_RUN_CONFIG_SUFFIX4: scc24-main
CM_DOCKER_IMAGE_NAME: scc24
deps:
- tags: clean,nvidia,scratch,_sdxl,_downloaded-data
extra_cache_rm_tags: scc24-base
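The new custom_system_nvidia entry relies on the same input_mapping mechanism handled in module.py above: a CLI flag is copied verbatim into the environment under its mapped CM_ key before the script runs. A minimal sketch, mirroring update_env_from_input_mapping:

```python
# Minimal sketch of input_mapping, mirroring update_env_from_input_mapping above.
# The CLI value 'yes' is just an example.
input_mapping = {'custom_system_nvidia': 'CM_CUSTOM_SYSTEM_NVIDIA'}
cli_input = {'custom_system_nvidia': 'yes'}

env = {}
for key in cli_input:
    if key in input_mapping:
        env[input_mapping[key]] = cli_input[key]

print(env)   # {'CM_CUSTOM_SYSTEM_NVIDIA': 'yes'}
```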