diff --git a/apps/microtvm/arduino/template_project/microtvm_api_server.py b/apps/microtvm/arduino/template_project/microtvm_api_server.py
index 131f92a208298..0e922f06cb515 100644
--- a/apps/microtvm/arduino/template_project/microtvm_api_server.py
+++ b/apps/microtvm/arduino/template_project/microtvm_api_server.py
@@ -214,14 +214,21 @@ def _template_model_header(self, source_dir, metadata):
         with open(source_dir / "model.h", "r") as f:
             model_h_template = Template(f.read())
 
-        assert (
-            metadata["style"] == "full-model"
+        all_module_names = []
+        for name in metadata["modules"].keys():
+            all_module_names.append(name)
+
+        assert all(
+            metadata["modules"][mod_name]["style"] == "full-model" for mod_name in all_module_names
         ), "when generating AOT, expect only full-model Model Library Format"
 
-        template_values = {
-            "workspace_size_bytes": metadata["memory"]["functions"]["main"][0][
+        workspace_size_bytes = 0
+        for mod_name in all_module_names:
+            workspace_size_bytes += metadata["modules"][mod_name]["memory"]["functions"]["main"][0][
                 "workspace_size_bytes"
-            ],
+            ]
+        template_values = {
+            "workspace_size_bytes": workspace_size_bytes,
         }
 
         with open(source_dir / "model.h", "w") as f:
diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu
index 22c372cc70b00..f04d8515b8dc2 100644
--- a/docker/Dockerfile.ci_gpu
+++ b/docker/Dockerfile.ci_gpu
@@ -139,6 +139,10 @@ COPY install/ubuntu_install_sccache.sh /install/ubuntu_install_sccache.sh
 RUN bash /install/ubuntu_install_sccache.sh
 ENV PATH /opt/sccache:$PATH
 
+# dnnl
+COPY install/ubuntu_install_dnnl.sh /install/ubuntu_install_dnnl.sh
+RUN bash /install/ubuntu_install_dnnl.sh
+
 # Environment variables
 ENV PATH=/usr/local/nvidia/bin:${PATH}
 ENV PATH=/usr/local/cuda/bin:${PATH}
diff --git a/docker/clear-stale-images.sh b/docker/clear-stale-images.sh
new file mode 100755
index 0000000000000..1e1e4b86a4d7b
--- /dev/null
+++ b/docker/clear-stale-images.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Remove tvm-related docker images from the local system which
+# are not used by the currently-checked-out branch in this git
+# repository plus any linked worktrees.
+
+set -euo pipefail
+
+dry_run=0
+repositories=( "$(cd $(dirname "$0") && git rev-parse --show-toplevel)" )
+skip_confirm=0
+verbose=0
+while [ "${1+x}" == "x" ]; do
+    case "$1" in
+        --help|-h)
+            echo "usage: $0 [-n] [-v] [-y] [<repository> ...]"
+            echo ""
+            echo "Remove tvm-related docker images from the local system which"
+            echo "are not used by the currently-checked-out branch in this git"
+            echo "repository plus any linked worktrees."
+ echo "" + echo 'This command should remove only docker images beginning with "tlcpack"' + echo "" + echo "Options:" + echo " -n Perform a dry-run and just print the docker rmi command" + echo " -v Verbosely list the images kept and why" + echo " -y Skip confirmation" + echo " Additional git repositories to consult." + exit 2 + ;; + -n) + dry_run=1 + ;; + -v) + verbose=1 + ;; + -y) + skip_confirm=1 + ;; + *) + repositories=( "${repositories[@]}" "$1" ) + ;; + esac + shift +done + +declare -a used_images +for r in "${repositories[@]}"; do + if [ -d "${r}/.git" ]; then + worktree="${r}" + else + worktree="$(cat "${r}/.git")" + fi + while read wt; do + d="${wt:9:${#wt}}" # strip "worktree " prefix + for img in $(cat "${d}/Jenkinsfile" | grep -E '^ci_[a-z]+ = ' | sed -E "s/ci_[a-z]+ = '([^\"]*)'/\1/"); do + used_images=( "${used_images[@]}" "${img}" ) + done + done < <(cd "${worktree}" && git worktree list --porcelain | grep '^worktree ') +done + +declare -a to_rm +while read image; do + if [ "${image}" == ":" ]; then + continue + fi + grep -qE "^tlcpack" < <(echo "$image") && is_tlcpack=1 || is_tlcpack=0 + if [ $is_tlcpack -eq 0 ]; then # non-tlcpack image + if [ $verbose -ne 0 ]; then + echo "skipping (non-tvm): $image" + fi + continue + fi + grep -q "$image" < <(echo "${used_images[@]}") && is_used=1 || is_used=0 + if [ $is_used -eq 1 ]; then # Image was found in used_images + if [ $verbose -ne 0 ]; then + echo "skipping (image used): $image" + fi + continue + fi + to_rm=( "${to_rm[@]}" "${image}" ) +done < <(docker images --format '{{.Repository}}:{{.Tag}}') + +docker_cmd=( docker rmi "${to_rm[@]}" ) +if [ ${dry_run} -ne 0 ]; then + echo "would run: ${docker_cmd[@]}" +else + if [ $skip_confirm -eq 0 ]; then + echo "will run: ${docker_cmd[@]}" + read -p "Proceed? [y/N] " proceed + if [ "${proceed-}" != "y" -a "${proceed-}" != "Y" ]; then + echo "Aborted." + exit 2 + fi + fi + "${docker_cmd[@]}" +fi diff --git a/docs/contribute/pull_request.rst b/docs/contribute/pull_request.rst index 82b5c5d43f416..26989fb8e6a3b 100644 --- a/docs/contribute/pull_request.rst +++ b/docs/contribute/pull_request.rst @@ -113,6 +113,14 @@ each time (e.g. you can test a change in CPU and i386 while retaining incrementa # run the CPU build and drop into a shell in the container python tests/scripts/ci.py cpu --interactive +We regularly update our docker images and, over time, stale images may unnecessarily consume disk +space. You can remove stale images that aren't used in the presently checked-out branch plus any +other worktrees using the following command: + +.. code:: bash + docker/clear-stale-images.sh + +Consult the ``--help`` for more options. C++ (local) ^^^^^^^^^^^ diff --git a/include/tvm/meta_schedule/database.h b/include/tvm/meta_schedule/database.h index 37a315bf744e9..b22d8beddbabb 100644 --- a/include/tvm/meta_schedule/database.h +++ b/include/tvm/meta_schedule/database.h @@ -98,6 +98,9 @@ struct WorkloadEqual { } }; +/*! \brief The class of measure candidates. */ +class MeasureCandidate; + /*! \brief The class of tuning records. */ class TuningRecordNode : public runtime::Object { public: @@ -123,6 +126,9 @@ class TuningRecordNode : public runtime::Object { static constexpr const char* _type_key = "meta_schedule.TuningRecord"; TVM_DECLARE_FINAL_OBJECT_INFO(TuningRecordNode, runtime::Object); + /*! \brief Construct the measure candidate given the initial IR module and trace + * stored in the tuning record. */ + MeasureCandidate AsMeasureCandidate() const; /*! 
diff --git a/include/tvm/meta_schedule/database.h b/include/tvm/meta_schedule/database.h
index 37a315bf744e9..b22d8beddbabb 100644
--- a/include/tvm/meta_schedule/database.h
+++ b/include/tvm/meta_schedule/database.h
@@ -98,6 +98,9 @@ struct WorkloadEqual {
   }
 };
 
+/*! \brief The class of measure candidates. */
+class MeasureCandidate;
+
 /*! \brief The class of tuning records. */
 class TuningRecordNode : public runtime::Object {
  public:
@@ -123,6 +126,9 @@ class TuningRecordNode : public runtime::Object {
   static constexpr const char* _type_key = "meta_schedule.TuningRecord";
   TVM_DECLARE_FINAL_OBJECT_INFO(TuningRecordNode, runtime::Object);
 
+  /*! \brief Construct the measure candidate given the initial IR module and trace
+   * stored in the tuning record. */
+  MeasureCandidate AsMeasureCandidate() const;
   /*!
    * \brief Export the tuning record to a JSON string.
    * \return An array containing the trace, running secs, serialized target, and
@@ -187,6 +193,11 @@ class DatabaseNode : public runtime::Object {
   * \return An array of top K tuning records for the given workload.
   */
  virtual Array<TuningRecord> GetTopK(const Workload& workload, int top_k) = 0;
+ /*!
+  * \brief Get all tuning records from the database.
+  * \return An Array of all the tuning records in the database.
+  */
+ virtual Array<TuningRecord> GetAllTuningRecords() = 0;
  /*!
   * \brief Get the size of the database.
   * \return The size of the database.
@@ -224,6 +235,11 @@ class PyDatabaseNode : public DatabaseNode {
   * \return An array of top K tuning records for the given workload.
   */
  using FGetTopK = runtime::TypedPackedFunc<Array<TuningRecord>(const Workload&, int)>;
+ /*!
+  * \brief The function type of `GetAllTuningRecords` method.
+  * \return An Array of all the tuning records in the database.
+  */
+ using FGetAllTuningRecords = runtime::TypedPackedFunc<Array<TuningRecord>()>;
  /*!
   * \brief The function type of `Size` method.
   * \return The size of the database.
@@ -238,6 +254,8 @@ class PyDatabaseNode : public DatabaseNode {
  FCommitTuningRecord f_commit_tuning_record;
  /*! \brief The packed function to the `GetTopK` function. */
  FGetTopK f_get_top_k;
+ /*! \brief The packed function to the `GetAllTuningRecords` function. */
+ FGetAllTuningRecords f_get_all_tuning_records;
  /*! \brief The packed function to the `Size` function. */
  FSize f_size;
 
@@ -249,6 +267,7 @@ class PyDatabaseNode : public DatabaseNode {
    // `f_commit_workload` is not visited
    // `f_commit_tuning_record` is not visited
    // `f_get_top_k` is not visited
+   // `f_get_all_tuning_records` is not visited
    // `f_size` is not visited
  }
 
@@ -273,6 +292,12 @@ class PyDatabaseNode : public DatabaseNode {
    return f_get_top_k(workload, top_k);
  }
 
+ Array<TuningRecord> GetAllTuningRecords() final {
+   ICHECK(f_get_all_tuning_records != nullptr)
+       << "PyDatabase's GetAllTuningRecords method not implemented!";
+   return f_get_all_tuning_records();
+ }
+
  int64_t Size() final {
    ICHECK(f_size != nullptr) << "PyDatabase's Size method not implemented!";
    return f_size();
@@ -302,6 +327,7 @@ class Database : public runtime::ObjectRef {
   * \param f_commit_workload The packed function of `CommitWorkload`.
   * \param f_commit_tuning_record The packed function of `CommitTuningRecord`.
   * \param f_get_top_k The packed function of `GetTopK`.
+  * \param f_get_all_tuning_records The packed function of `GetAllTuningRecords`.
   * \param f_size The packed function of `Size`.
   * \return The created database.
   */
@@ -309,6 +335,7 @@ class Database : public runtime::ObjectRef {
                              PyDatabaseNode::FCommitWorkload f_commit_workload,
                              PyDatabaseNode::FCommitTuningRecord f_commit_tuning_record,
                              PyDatabaseNode::FGetTopK f_get_top_k,
+                             PyDatabaseNode::FGetAllTuningRecords f_get_all_tuning_records,
                              PyDatabaseNode::FSize f_size);
   TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(Database, runtime::ObjectRef, DatabaseNode);
 };
diff --git a/python/tvm/auto_scheduler/testing/tune_onnx.py b/python/tvm/auto_scheduler/testing/tune_onnx.py
index 8dc778e8f49ca..5fbc875d1eda9 100644
--- a/python/tvm/auto_scheduler/testing/tune_onnx.py
+++ b/python/tvm/auto_scheduler/testing/tune_onnx.py
@@ -28,6 +28,7 @@
 from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc
 from tvm.meta_schedule.utils import cpu_count
 from tvm.relay.frontend import from_onnx
+from tvm.support import describe
 
 
 def _parse_args():
@@ -148,6 +149,7 @@ def main():
     else:
         raise NotImplementedError(f"Unsupported target {ARGS.target}")
 
+    describe()
     print(f"Workload: {ARGS.model_name}")
     onnx_model = onnx.load(ARGS.onnx_path)
     shape_dict = {}
diff --git a/python/tvm/auto_scheduler/testing/tune_relay.py b/python/tvm/auto_scheduler/testing/tune_relay.py
index f1a2b573a69be..58ea327ec50b6 100644
--- a/python/tvm/auto_scheduler/testing/tune_relay.py
+++ b/python/tvm/auto_scheduler/testing/tune_relay.py
@@ -146,6 +146,9 @@ def main():
         )
     else:
         raise NotImplementedError(f"Unsupported target {ARGS.target}")
+
+    describe()
+    print(f"Workload: {ARGS.workload}")
     mod, params, (input_name, input_shape, input_dtype) = get_network(
         ARGS.workload,
         ARGS.input_shape,
@@ -153,7 +156,6 @@ def main():
     )
     input_info = {input_name: input_shape}
     input_data = {}
-    print(f"Workload: {ARGS.workload}")
     for input_name, input_shape in input_info.items():
         print(f"  input_name: {input_name}")
         print(f"  input_shape: {input_shape}")
diff --git a/python/tvm/auto_scheduler/testing/tune_te.py b/python/tvm/auto_scheduler/testing/tune_te.py
index d640a32c29419..4a6874a53d347 100644
--- a/python/tvm/auto_scheduler/testing/tune_te.py
+++ b/python/tvm/auto_scheduler/testing/tune_te.py
@@ -91,6 +91,8 @@ def _parse_args():
 
 
 def main():
+    describe()
+    print(f"Workload: {ARGS.workload}")
     log_file = os.path.join(ARGS.work_dir, f"{ARGS.workload}.json")
     workload_func, params = CONFIGS[ARGS.workload]
     params = params[0]  # type: ignore
diff --git a/python/tvm/contrib/pipeline_executor.py b/python/tvm/contrib/pipeline_executor.py
index a50fffaa2b434..5ef309bb28080 100644
--- a/python/tvm/contrib/pipeline_executor.py
+++ b/python/tvm/contrib/pipeline_executor.py
@@ -55,6 +55,7 @@ def __init__(self, module):
         self._get_input = self.module["get_input"]
         self._get_output = self.module["get_output"]
         self._get_num_outputs = self.module["get_num_outputs"]
+        self._get_num_inputs = self.module["get_num_inputs"]
         self._get_input_pipeline_map = self.module["get_input_pipeline_map"]
         self._get_pipe_execute_count = self.module["get_execute_count"]
 
@@ -159,6 +160,16 @@ def num_outputs(self):
         """
         return self._get_num_outputs()
 
+    @property
+    def num_inputs(self):
+        """Get the number of inputs
+
+        Returns
+        -------
+        count : int
+            The number of inputs
+        """
+        return self._get_num_inputs()
+
     @staticmethod
     def load_library(config_file_name):
         """Import files to create a pipeline executor.
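For context, a minimal usage sketch of the new `num_inputs` property (not part of the patch). `PipelineModule.load_library` is the loader shown above; the config path is purely illustrative and assumes a pipeline exported by an earlier build step.

```python
from tvm.contrib import pipeline_executor

# Hypothetical: load a previously exported pipeline; the path is illustrative.
pipe = pipeline_executor.PipelineModule.load_library("pipeline_out/config")
print(pipe.num_inputs)   # new property, backed by the "get_num_inputs" packed function
print(pipe.num_outputs)  # existing counterpart
```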
diff --git a/python/tvm/driver/tvmc/model.py b/python/tvm/driver/tvmc/model.py
index 04946ec9c6d0c..5f40d21223125 100644
--- a/python/tvm/driver/tvmc/model.py
+++ b/python/tvm/driver/tvmc/model.py
@@ -391,9 +391,20 @@ def import_package(self, package_path: str):
             with open(temp.relpath("metadata.json")) as metadata_json:
                 metadata = json.load(metadata_json)
 
-            has_graph_executor = "graph" in metadata["executors"]
-            graph = temp.relpath("executor-config/graph/graph.json") if has_graph_executor else None
-            params = temp.relpath(f'parameters/{metadata["model_name"]}.params')
+            all_module_names = []
+            for name in metadata["modules"].keys():
+                all_module_names.append(name)
+            assert len(all_module_names) == 1, "Multiple modules in MLF are not supported."
+
+            module_name = all_module_names[0]
+            module_metadata = metadata["modules"][module_name]
+            has_graph_executor = "graph" in module_metadata["executors"]
+            graph = (
+                temp.relpath(f"executor-config/graph/{module_name}.graph")
+                if has_graph_executor
+                else None
+            )
+            params = temp.relpath(f"parameters/{module_name}.params")
 
             self.type = "mlf"
         else:
diff --git a/python/tvm/meta_schedule/database/database.py b/python/tvm/meta_schedule/database/database.py
index 802a739e69582..0c11f77591cc6 100644
--- a/python/tvm/meta_schedule/database/database.py
+++ b/python/tvm/meta_schedule/database/database.py
@@ -115,6 +115,17 @@ def __init__(  # type: ignore # pylint: disable=too-many-arguments
             args_info,
         )
 
+    def as_measure_candidate(self) -> Any:
+        """Generate a measure candidate given an initial IR module and a trace
+        stored in the tuning record.
+
+        Returns
+        -------
+        candidate : MeasureCandidate
+            A generated candidate.
+        """
+        return _ffi_api.TuningRecordAsMeasureCandidate(self)  # type: ignore # pylint: disable=no-member
+
     def as_json(self) -> Any:
         """Export the tuning record to a JSON string.
 
@@ -203,6 +214,16 @@ def get_top_k(self, workload: Workload, top_k: int) -> List[TuningRecord]:
         """
         return _ffi_api.DatabaseGetTopK(self, workload, top_k)  # type: ignore # pylint: disable=no-member
 
+    def get_all_tuning_records(self) -> List[TuningRecord]:
+        """Get all the tuning records from the database.
+
+        Returns
+        -------
+        tuning_records : List[TuningRecord]
+            All tuning records from the database.
+        """
+        return _ffi_api.DatabaseGetAllTuningRecords(self)  # type: ignore # pylint: disable=no-member
+
     def __len__(self) -> int:
         """Get the number of records in the database.
 
@@ -229,6 +250,7 @@ def __init__(
         f_commit_workload: Callable = None,
         f_commit_tuning_record: Callable = None,
         f_get_top_k: Callable = None,
+        f_get_all_tuning_records: Callable = None,
         f_size: Callable = None,
     ):
         """Constructor."""
@@ -239,6 +261,7 @@ def __init__(
             f_commit_workload,
             f_commit_tuning_record,
             f_get_top_k,
+            f_get_all_tuning_records,
             f_size,
         )
 
@@ -258,6 +281,7 @@ class PyDatabase:
             "commit_workload",
             "commit_tuning_record",
             "get_top_k",
+            "get_all_tuning_records",
             "__len__",
         ],
     }
@@ -317,6 +341,16 @@ def get_top_k(self, workload: Workload, top_k: int) -> List[TuningRecord]:
         """
         raise NotImplementedError
 
+    def get_all_tuning_records(self) -> List[TuningRecord]:
+        """Get all the tuning records from the database.
+
+        Returns
+        -------
+        tuning_records : List[TuningRecord]
+            All tuning records from the database.
+        """
+        raise NotImplementedError
+
     def __len__(self) -> int:
         """Get the number of records in the database.
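Taken together with the C++ plumbing above, the new Python surface can be exercised as below — a sketch assuming a JSON database produced by an earlier tuning run (file names illustrative):

```python
from tvm import meta_schedule as ms

db = ms.database.JSONDatabase(
    path_workload="model/task_workload.json",
    path_tuning_record="model/task_candidates.json",
)
# Fetch every stored record, then rebuild measure candidates from the
# (workload IRModule, trace) pairs for re-measurement.
records = db.get_all_tuning_records()
candidates = [rec.as_measure_candidate() for rec in records]
```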
diff --git a/python/tvm/meta_schedule/database/memory_database.py b/python/tvm/meta_schedule/database/memory_database.py
index 6d10e4b5272a9..95d937cc77aad 100644
--- a/python/tvm/meta_schedule/database/memory_database.py
+++ b/python/tvm/meta_schedule/database/memory_database.py
@@ -56,6 +56,9 @@ def get_top_k(self, workload: Workload, top_k: int) -> List[TuningRecord]:
             )
         )[: int(top_k)]
 
+    def get_all_tuning_records(self) -> List[TuningRecord]:
+        return self.records
+
     def __len__(self) -> int:
         return len(self.records)
 
diff --git a/python/tvm/meta_schedule/runner/config.py b/python/tvm/meta_schedule/runner/config.py
index 585b88ed9939c..e3b5364dee22d 100644
--- a/python/tvm/meta_schedule/runner/config.py
+++ b/python/tvm/meta_schedule/runner/config.py
@@ -28,9 +28,14 @@ class EvaluatorConfig(NamedTuple):
     Parameters
     ----------
     number: int
-        The number of runs.
+        The number of times to run the evaluated function when taking a single measurement;
+        together, these runs form one `repeat` of the measurement.
     repeat: int
-        The number of times to repeat in each run.
+        The number of times to repeat the measurement.
+        In total, the function will be invoked (1 + number x repeat) times,
+        where the first invocation is a warm-up and is discarded.
+        The returned result contains `repeat` costs,
+        each of which is an average of `number` costs.
     min_repeat_ms: int
         Minimum repeat time in ms. if the execution latency is too short,
         increase the number of runs to the given time (in ms) to reduce the measurement error.
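A worked example of the clarified semantics: with the configuration below, the function is invoked 1 + number x repeat = 1 + 3 x 2 = 7 times; the first call warms up and is discarded, and the result carries repeat = 2 costs, each averaged over number = 3 runs.

```python
from tvm import meta_schedule as ms

config = ms.runner.EvaluatorConfig(
    number=3,          # runs averaged into one reported cost
    repeat=2,          # independent costs reported
    min_repeat_ms=100,
    enable_cpu_cache_flush=False,
)
```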
diff --git a/python/tvm/meta_schedule/testing/dataset_sample_candidates.py b/python/tvm/meta_schedule/testing/dataset_sample_candidates.py
index c80d78173e2e4..35b872e7351e8 100644
--- a/python/tvm/meta_schedule/testing/dataset_sample_candidates.py
+++ b/python/tvm/meta_schedule/testing/dataset_sample_candidates.py
@@ -103,6 +103,14 @@ def sample_candidates(task, task_name, model_name):
     -------
     None
     """
+    candidate_path = os.path.join(
+        args.candidate_cache_dir, model_name, task_name + "_candidates.json"
+    )
+    workload_path = os.path.join(args.candidate_cache_dir, model_name, task_name + "_workload.json")
+    database = ms.database.JSONDatabase(
+        path_workload=workload_path,
+        path_tuning_record=candidate_path,
+    )
     sample_init_population = tvm.get_global_func(
         "meta_schedule.SearchStrategyEvolutionarySearchSampleInitPopulation"
     )
@@ -128,7 +136,7 @@ def sample_candidates(task, task_name, model_name):
     context.initialize()
     context.pre_tuning(
         context.generate_design_space(),
-        database=ms.database.MemoryDatabase(),  # type: ignore
+        database=database,
        cost_model=ms.cost_model.RandomModel(),  # type: ignore
     )
 
@@ -148,16 +156,9 @@ def sample_candidates(task, task_name, model_name):
         all_states = all_states[: args.num_samples_per_task]
 
     workload = ms.database.Workload(context.mod)
-    file_path = os.path.join(args.candidate_cache_dir, model_name, task_name + ".json")
-    with open(file_path, "w", encoding="utf8") as file:
-        for i, state in enumerate(all_states):
-            tuning_record = ms.database.TuningRecord(state.trace, workload)
-            json_str = json.dumps(tuning_record.as_json())
-            assert "\n" not in json_str, "Failed to generate single line string."
-            if i == len(all_states) - 1:
-                file.write(json_str)
-            else:
-                file.write(json_str + "\n")
+    database.commit_workload(context.mod)
+    for state in all_states:
+        database.commit_tuning_record(ms.database.TuningRecord(state.trace, workload))
 
 
 args = _parse_args()  # pylint: disable=invalid-name
diff --git a/python/tvm/meta_schedule/testing/distributed_measure_candidates.py b/python/tvm/meta_schedule/testing/distributed_measure_candidates.py
new file mode 100644
index 0000000000000..8e646c4846724
--- /dev/null
+++ b/python/tvm/meta_schedule/testing/distributed_measure_candidates.py
@@ -0,0 +1,198 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=missing-docstring
+
+import argparse
+import glob
+import os
+
+from tqdm import tqdm  # type: ignore
+from tvm import meta_schedule as ms
+from tvm.target import Target
+
+
+def _parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--candidate_cache_dir", type=str, help="Please provide the full path to the candidates."
+    )
+    parser.add_argument(
+        "--result_cache_dir", type=str, help="Please provide the full path to the result database."
+    )
+    parser.add_argument(
+        "--target",
+        type=str,
+        default="nvidia/nvidia-v100",
+        help="Please specify the target hardware for tuning context.",
+    )
+    parser.add_argument(
+        "--rpc_host", type=str, help="Please provide the private IPv4 address for the tracker."
+    )
+    parser.add_argument(
+        "--rpc_port", type=int, default=4445, help="Please provide the port for the tracker."
+    )
+    parser.add_argument(
+        "--rpc_key",
+        type=str,
+        default="p3.2xlarge",
+        help="Please provide the key for the rpc servers.",
+    )
+    parser.add_argument(
+        "--builder_timeout_sec",
+        type=int,
+        default=10,
+        help="The time for the builder session to time out.",
+    )
+    parser.add_argument(
+        "--min_repeat_ms", type=int, default=100, help="The time for preheating the gpu."
+    )
+    parser.add_argument(
+        "--runner_timeout_sec",
+        type=int,
+        default=100,
+        help="The time for the runner session to time out.",
+    )
+    parser.add_argument(
+        "--cpu_flush", type=bool, default=False, help="Whether to enable cpu cache flush or not."
+    )
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=128,
+        help="The batch size of candidates sent to builder and runner each time.",
+    )
+    return parser.parse_args()
+
+
+# pylint: disable=too-many-locals
+def measure_candidates(database, builder, runner):
+    """Send the candidates to builder and runner for distributed measurement,
+    and save the results in a new json database.
+
+    Parameters
+    ----------
+    database : JSONDatabase
+        The database for candidates to be measured.
+    builder : Builder
+        The builder for building the candidates.
+    runner : Runner
+        The runner for measuring the candidates.
+
+    Returns
+    -------
+    None
+    """
+    candidates, runner_results, build_fail_indices, run_fail_indices = [], [], [], []
+    context = ms.TuneContext(target=Target(args.target))
+    tuning_records = database.get_all_tuning_records()
+    for record in tuning_records:
+        candidates.append(record.as_measure_candidate())
+    with ms.Profiler() as profiler:
+        for idx in range(0, len(candidates), args.batch_size):
+            batch_candidates = candidates[idx : idx + args.batch_size]
+            context._set_measure_candidates(batch_candidates)  # pylint: disable=protected-access
+            with ms.Profiler.timeit("build"):
+                context._send_to_builder(builder)  # pylint: disable=protected-access
+            with ms.Profiler.timeit("run"):
+                context._send_to_runner(runner)  # pylint: disable=protected-access
+                batch_runner_results = context._join()  # pylint: disable=protected-access
+            runner_results.extend(batch_runner_results)
+            for i, result in enumerate(context.builder_results):
+                if result.error_msg is None:
+                    ms.utils.remove_build_dir(result.artifact_path)
+                else:
+                    build_fail_indices.append(i + idx)
+            context._clear_measure_state()  # pylint: disable=protected-access
+
+    model_name, workload_name = database.path_workload.split("/")[-2:]
+    record_name = database.path_tuning_record.split("/")[-1]
+    new_database = ms.database.JSONDatabase(
+        path_workload=os.path.join(args.result_cache_dir, model_name, workload_name),
+        path_tuning_record=os.path.join(args.result_cache_dir, model_name, record_name),
+    )
+    workload = tuning_records[0].workload
+    new_database.commit_workload(workload.mod)
+    for i, (record, result) in enumerate(zip(tuning_records, runner_results)):
+        if result.error_msg is None:
+            new_database.commit_tuning_record(
+                ms.database.TuningRecord(
+                    trace=record.trace,
+                    workload=workload,
+                    run_secs=[v.value for v in result.run_secs],
+                    target=Target(args.target),
+                )
+            )
+        else:
+            run_fail_indices.append(i)
+    fail_indices_name = workload_name.replace("_workload.json", "_failed_indices.txt")
+    with open(
+        os.path.join(args.result_cache_dir, model_name, fail_indices_name), "w", encoding="utf8"
+    ) as file:
+        file.write(" ".join([str(n) for n in run_fail_indices]))
+    print(
+        f"Builder time: {profiler.get()['build']}, Runner time: {profiler.get()['run']}\n\
+            Failed number of builds: {len(build_fail_indices)},\
+            Failed number of runs: {len(run_fail_indices)}"
+    )
+
+
+args = _parse_args()  # pylint: disable=invalid-name
+
+
+def main():
+    builder = ms.builder.LocalBuilder(timeout_sec=args.builder_timeout_sec)
+    runner = ms.runner.RPCRunner(
+        rpc_config=ms.runner.RPCConfig(
+            tracker_host=args.rpc_host,
+            tracker_port=args.rpc_port,
+            tracker_key=args.rpc_key,
+            session_timeout_sec=args.runner_timeout_sec,
+        ),
+        evaluator_config=ms.runner.EvaluatorConfig(
+            number=3,
+            repeat=1,
+            min_repeat_ms=args.min_repeat_ms,
+            enable_cpu_cache_flush=args.cpu_flush,
+        ),
+        max_workers=os.cpu_count(),
+    )
+    if not os.path.isdir(args.candidate_cache_dir):
+        raise Exception("Please provide a correct candidate cache dir.")
+    try:
+        os.makedirs(args.result_cache_dir, exist_ok=True)
+    except OSError:
+        print(f"Directory {args.result_cache_dir} cannot be created successfully.")
+    model_dirs = glob.glob(os.path.join(args.candidate_cache_dir, "*"))
+    for model_dir in model_dirs:
+        model_name = model_dir.split("/")[-1]
+        os.makedirs(os.path.join(args.result_cache_dir, model_name), exist_ok=True)
+        all_tasks = glob.glob(os.path.join(model_dir, "*.json"))
+        workload_paths = []
+        for path in all_tasks:
+            if path.endswith("_workload.json"):
+                workload_paths.append(path)
+        for workload_path in tqdm(workload_paths):
+            candidate_path = workload_path.replace("_workload.json", "_candidates.json")
+            database = ms.database.JSONDatabase(
+                path_workload=workload_path,
+                path_tuning_record=candidate_path,
+            )
+            measure_candidates(database, builder, runner)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/tvm/meta_schedule/testing/tune_onnx.py b/python/tvm/meta_schedule/testing/tune_onnx.py
index 2409041c31760..88cb360c0171b 100644
--- a/python/tvm/meta_schedule/testing/tune_onnx.py
+++ b/python/tvm/meta_schedule/testing/tune_onnx.py
@@ -25,6 +25,7 @@
 from tvm import meta_schedule as ms
 from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc
 from tvm.relay.frontend import from_onnx
+from tvm.support import describe
 
 
 def _parse_args():
@@ -115,6 +116,7 @@ def _parse_args():
 
 
 def main():
+    describe()
     print(f"Workload: {ARGS.model_name}")
     onnx_model = onnx.load(ARGS.onnx_path)
     shape_dict = {}
diff --git a/python/tvm/meta_schedule/testing/tune_relay.py b/python/tvm/meta_schedule/testing/tune_relay.py
index ee1eec2aa0558..ce15c60c15e68 100644
--- a/python/tvm/meta_schedule/testing/tune_relay.py
+++ b/python/tvm/meta_schedule/testing/tune_relay.py
@@ -24,6 +24,7 @@
 from tvm import meta_schedule as ms
 from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc
 from tvm.meta_schedule.testing.relay_workload import get_network
+from tvm.support import describe
 
 
 def _parse_args():
@@ -113,6 +114,8 @@ def _parse_args():
 
 
 def main():
+    describe()
+    print(f"Workload: {ARGS.workload}")
     mod, params, (input_name, input_shape, input_dtype) = get_network(
         ARGS.workload,
         ARGS.input_shape,
@@ -120,7 +123,6 @@ def main():
     )
     input_info = {input_name: input_shape}
     input_data = {}
-    print(f"Workload: {ARGS.workload}")
     for input_name, input_shape in input_info.items():
         print(f"  input_name: {input_name}")
         print(f"  input_shape: {input_shape}")
diff --git a/python/tvm/meta_schedule/testing/tune_te.py b/python/tvm/meta_schedule/testing/tune_te.py
index 8caa023036fa7..8740d74424781 100644
--- a/python/tvm/meta_schedule/testing/tune_te.py
+++ b/python/tvm/meta_schedule/testing/tune_te.py
@@ -24,6 +24,7 @@
 from tvm import meta_schedule as ms
 from tvm import tir
 from tvm.meta_schedule.testing.te_workload import create_te_workload
+from tvm.support import describe
 
 
 def _parse_args():
@@ -102,6 +103,8 @@ def _parse_args():
 
 
 def main():
+    describe()
+    print(f"Workload: {ARGS.workload}")
     runner = ms.runner.RPCRunner(
         rpc_config=ARGS.rpc_config,
         evaluator_config=ms.runner.EvaluatorConfig(
diff --git a/python/tvm/meta_schedule/tune_context.py b/python/tvm/meta_schedule/tune_context.py
index b7975e7b2c4e9..30c726ded25bb 100644
--- a/python/tvm/meta_schedule/tune_context.py
+++ b/python/tvm/meta_schedule/tune_context.py
@@ -171,6 +171,50 @@ def __init__(
         )
         _ffi_api.TuneContextInitialize(self)  # type: ignore # pylint: disable=no-member
 
+    def _set_measure_candidates(self, candidates):
+        """Set candidates in a tuning context.
+
+        Parameters
+        ----------
+        candidates : List[MeasureCandidate]
+            A list of measure candidates for the tuning context.
+        """
+        _ffi_api.TuneContextSetMeasureCandidates(self, candidates)  # type: ignore # pylint: disable=no-member
+
+    def _send_to_builder(self, builder):
+        """Send candidates to builder.
+
+        Parameters
+        ----------
+        builder : Builder
+            The builder for building the candidates.
+ """ + _ffi_api.TuneContextSendToBuilder(self, builder) # type: ignore # pylint: disable=no-member + + def _send_to_runner(self, runner): + """Send candidates to runner. + + Parameters + ---------- + runner : Runner + The runner for running the candidates. + """ + _ffi_api.TuneContextSendToRunner(self, runner) # type: ignore # pylint: disable=no-member + + def _join(self): + """Join the runner processes. + + Returns + ------- + result : List[RunnerResult] + The runner results. + """ + return _ffi_api.TuneContextJoin(self) # type: ignore # pylint: disable=no-member + + def _clear_measure_state(self): + """Clear the measure states.""" + _ffi_api.TuneContextClearMeasureState(self) # type: ignore # pylint: disable=no-member + def generate_design_space(self) -> List[Schedule]: """Generate design spaces given a module. diff --git a/python/tvm/micro/contrib/stm32/emitter.py b/python/tvm/micro/contrib/stm32/emitter.py index aec5912871fd5..814f98f1b7882 100644 --- a/python/tvm/micro/contrib/stm32/emitter.py +++ b/python/tvm/micro/contrib/stm32/emitter.py @@ -482,8 +482,18 @@ def parse_library_format(self, model_library_format_path, quantization=None): with tarfile.TarFile(model_library_format_path) as f: f.extractall(extract_path) + with open(os.path.join(extract_path, "metadata.json")) as metadata_f: + metadata = json.load(metadata_f) + + all_module_names = [] + for name in metadata["modules"].keys(): + all_module_names.append(name) + assert len(metadata["modules"]) == 1, "Multiple modules is not supported." + # Extract informations from the Model Library Format - graph_file = os.path.join(extract_path, "executor-config", "graph", "graph.json") + graph_file = os.path.join( + extract_path, "executor-config", "graph", f"{all_module_names[0]}.graph" + ) with open(graph_file, "r") as f: # returns JSON object as a dictionary graph_dict = json.load(f) diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py index 1dd63b319dbd5..e220fa1ca5430 100644 --- a/python/tvm/micro/model_library_format.py +++ b/python/tvm/micro/model_library_format.py @@ -39,6 +39,7 @@ # This should be kept identical to runtime::symbol::tvm_module_main MAIN_FUNC_NAME_STR = "__tvm_main__" STANDALONE_CRT_URL = "./runtime" +METADATA_FILE = "metadata.json" class UnsupportedInModelLibraryFormatError(Exception): @@ -67,56 +68,78 @@ def generate_c_interface_header( EPHEMERAL_MODULE_TYPE_KEYS = ("metadata_module",) -def _populate_codegen_dir(mod, codegen_dir: str, module_name: str = None): +def _populate_codegen_dir( + mods: typing.Union[ + typing.List[executor_factory.ExecutorFactoryModule], + typing.List[tvm.runtime.Module], + ], + codegen_dir: str, +): """Populate the codegen sub-directory as part of a Model Library Format export. Parameters ---------- - mod : tvm.runtime.Module - Module which should be written to codegen_dir. + mods : List[tvm.relay.backend.executor_factory.ExecutorFactoryModule], List[tvm.runtime.Module] + A list of the return value of tvm.relay.build, which + will be exported into Model Library Format. codegen_dir : str Path to the codegen directory on disk. 
diff --git a/python/tvm/micro/contrib/stm32/emitter.py b/python/tvm/micro/contrib/stm32/emitter.py
index aec5912871fd5..814f98f1b7882 100644
--- a/python/tvm/micro/contrib/stm32/emitter.py
+++ b/python/tvm/micro/contrib/stm32/emitter.py
@@ -482,8 +482,18 @@ def parse_library_format(self, model_library_format_path, quantization=None):
         with tarfile.TarFile(model_library_format_path) as f:
             f.extractall(extract_path)
 
+        with open(os.path.join(extract_path, "metadata.json")) as metadata_f:
+            metadata = json.load(metadata_f)
+
+        all_module_names = []
+        for name in metadata["modules"].keys():
+            all_module_names.append(name)
+        assert len(metadata["modules"]) == 1, "Multiple modules are not supported."
+
         # Extract information from the Model Library Format
-        graph_file = os.path.join(extract_path, "executor-config", "graph", "graph.json")
+        graph_file = os.path.join(
+            extract_path, "executor-config", "graph", f"{all_module_names[0]}.graph"
+        )
         with open(graph_file, "r") as f:
             # returns JSON object as a dictionary
             graph_dict = json.load(f)
diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py
index 1dd63b319dbd5..e220fa1ca5430 100644
--- a/python/tvm/micro/model_library_format.py
+++ b/python/tvm/micro/model_library_format.py
@@ -39,6 +39,7 @@
 # This should be kept identical to runtime::symbol::tvm_module_main
 MAIN_FUNC_NAME_STR = "__tvm_main__"
 STANDALONE_CRT_URL = "./runtime"
+METADATA_FILE = "metadata.json"
 
 
 class UnsupportedInModelLibraryFormatError(Exception):
@@ -67,56 +68,78 @@ def generate_c_interface_header(
 EPHEMERAL_MODULE_TYPE_KEYS = ("metadata_module",)
 
 
-def _populate_codegen_dir(mod, codegen_dir: str, module_name: str = None):
+def _populate_codegen_dir(
+    mods: typing.Union[
+        typing.List[executor_factory.ExecutorFactoryModule],
+        typing.List[tvm.runtime.Module],
+    ],
+    codegen_dir: str,
+):
     """Populate the codegen sub-directory as part of a Model Library Format export.
 
     Parameters
     ----------
-    mod : tvm.runtime.Module
-        Module which should be written to codegen_dir.
+    mods : List[tvm.relay.backend.executor_factory.ExecutorFactoryModule], List[tvm.runtime.Module]
+        A list of the return values of tvm.relay.build, which
+        will be exported into Model Library Format.
     codegen_dir : str
         Path to the codegen directory on disk.
-    module_name: Optional[str]
-        Name used to prefix the generated source files
     """
-    dso_modules = mod._collect_dso_modules()
-    non_dso_modules = mod._collect_from_import_tree(lambda m: m not in dso_modules)
+    dso_modules = []
+    for mod in mods:
+        if isinstance(mod, executor_factory.ExecutorFactoryModule):
+            lib = mod.lib
+        elif isinstance(mod, tvm.runtime.Module):
+            lib = mod
+        else:
+            raise RuntimeError(f"Unsupported module type: {type(mod)}")
 
-    # Filter ephemeral modules which cannot be exported.
-    dso_modules = [m for m in dso_modules if m.type_key not in EPHEMERAL_MODULE_TYPE_KEYS]
-    non_dso_modules = [m for m in non_dso_modules if m.type_key not in EPHEMERAL_MODULE_TYPE_KEYS]
+        dso_modules = lib._collect_dso_modules()
+        non_dso_modules = lib._collect_from_import_tree(lambda m: m not in dso_modules)
 
-    if non_dso_modules:
-        raise UnsupportedInModelLibraryFormatError(
-            f"Don't know how to export non-c or non-llvm modules; found: {non_dso_modules!r}"
-        )
+        # Filter ephemeral modules which cannot be exported.
+        dso_modules = [m for m in dso_modules if m.type_key not in EPHEMERAL_MODULE_TYPE_KEYS]
+        non_dso_modules = [
+            m for m in non_dso_modules if m.type_key not in EPHEMERAL_MODULE_TYPE_KEYS
+        ]
 
-    mod_indices = {"lib": 0, "src": 0}
-    host_codegen_dir = os.path.join(codegen_dir, "host")
-    lib_name = f"{module_name}_lib" if module_name else "lib"
-
-    for dso_mod in dso_modules:
-        if dso_mod.type_key == "c":
-            assert dso_mod.format in ["c", "cc", "cpp"]
-            ext = dso_mod.format
-            index = mod_indices["src"]
-            mod_indices["src"] += 1
-            parent_dir = os.path.join(host_codegen_dir, "src")
-            file_name = os.path.join(parent_dir, f"{lib_name}{index}.{ext}")
-        elif dso_mod.type_key == "llvm":
-            index = mod_indices["lib"]
-            mod_indices["lib"] += 1
-            parent_dir = os.path.join(host_codegen_dir, "lib")
-            file_name = os.path.join(parent_dir, f"{lib_name}{index}.o")
-        else:
-            assert (
-                False
-            ), f"do not expect module with type_key={mod.type_key} from _collect_dso_modules"
+        if non_dso_modules:
+            raise UnsupportedInModelLibraryFormatError(
+                f"Don't know how to export non-c or non-llvm modules; found: {non_dso_modules!r}"
+            )
+
+        mod_indices = {"lib": 0, "src": 0}
+        host_codegen_dir = os.path.join(codegen_dir, "host")
+        lib_name = (
+            f"{mod.libmod_name}_lib"
+            if isinstance(mod, executor_factory.ExecutorFactoryModule)
+            else "lib"
+        )
 
-        if not os.path.exists(parent_dir):
-            os.makedirs(parent_dir)
-        dso_mod.save(file_name)
+        for dso_mod in dso_modules:
+            if dso_mod.type_key == "c":
+                assert dso_mod.format in ["c", "cc", "cpp"]
+                ext = dso_mod.format
+                index = mod_indices["src"]
+                mod_indices["src"] += 1
+                parent_dir = os.path.join(host_codegen_dir, "src")
+                file_name = os.path.join(parent_dir, f"{lib_name}{index}.{ext}")
+            elif dso_mod.type_key == "llvm":
+                index = mod_indices["lib"]
+                mod_indices["lib"] += 1
+                parent_dir = os.path.join(host_codegen_dir, "lib")
+                file_name = os.path.join(parent_dir, f"{lib_name}{index}.o")
+            else:
+                assert (
+                    False
+                ), f"do not expect module with type_key={lib.type_key} from _collect_dso_modules"
+
+            if not os.path.exists(parent_dir):
+                os.makedirs(parent_dir)
+            dso_mod.save(file_name)
 
 
 def _build_memory_map(mod):
@@ -297,7 +320,7 @@ def _should_generate_interface_header(mod):
     return "interface-api" in mod.executor and mod.executor["interface-api"] == "c"
 
 
-def _make_tar(source_dir, tar_file_path, mod):
+def _make_tar(source_dir, tar_file_path, modules):
     """Build a tar file from source_dir."""
     with tarfile.open(tar_file_path, "w") as tar_f:
 
@@ -307,91 +330,127 @@ def reset(tarinfo):
             return tarinfo
 
         tar_f.add(str(source_dir), arcname=".", filter=reset)
 
-    is_aot = isinstance(mod, executor_factory.AOTExecutorFactoryModule)
-    if is_aot and str(mod.runtime) == "crt":
-        tar_f.add(get_standalone_crt_dir(), arcname=STANDALONE_CRT_URL)
+
+    for mod in modules:
+        is_aot = isinstance(mod, executor_factory.AOTExecutorFactoryModule)
+        if is_aot and str(mod.runtime) == "crt":
+            tar_f.add(get_standalone_crt_dir(), arcname=STANDALONE_CRT_URL)
+            break
 
 
-_GENERATED_VERSION = 6
+_GENERATED_VERSION = 7
+
+
+def _is_module_names_unique(mods: typing.List[executor_factory.ExecutorFactoryModule]):
+    """Check if built modules have unique names.
+
+    Parameters
+    ----------
+    mods : List[tvm.relay.backend.executor_factory.ExecutorFactoryModule]
+        A list of the return values of tvm.relay.build,
+        which will be exported into Model Library Format.
+    """
+    all_names = []
+    for mod in mods:
+        all_names.append(mod.libmod_name)
+
+    return len(set(all_names)) == len(all_names)
 
 
 def _export_graph_model_library_format(
-    mod: executor_factory.ExecutorFactoryModule, tempdir: pathlib.Path
+    mods: typing.List[executor_factory.ExecutorFactoryModule], tempdir: pathlib.Path
 ):
     """Export a tvm.relay.build artifact in Model Library Format.
 
     Parameters
     ----------
-    mod : tvm.relay.backend.executor_factory.ExecutorFactoryModule
-        The return value of tvm.relay.build, which will be exported into Model Library Format.
+    mods : List[tvm.relay.backend.executor_factory.ExecutorFactoryModule]
+        A list of the return values of tvm.relay.build,
+        which will be exported into Model Library Format.
     tempdir : pathlib.Path
         Temporary directory to populate with Model Library Format contents.
     """
-    is_aot = isinstance(mod, executor_factory.AOTExecutorFactoryModule)
-    executor = ["aot"] if is_aot else ["graph"]
+
+    assert _is_module_names_unique(mods), "Multiple modules should have unique names."
 
     metadata = {
         "version": _GENERATED_VERSION,
-        "model_name": mod.libmod_name,
-        "export_datetime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%SZ"),
-        "memory": _build_memory_map(mod),
-        "target": [str(t) for t in mod.target],
-        "executors": executor,
-        "style": "full-model",
     }
-
-    if is_aot and (str(mod.runtime) == "crt"):
-        standalone_crt = {
-            "short_name": "tvm_standalone_crt",
-            "url": f"{STANDALONE_CRT_URL}",
-            "url_type": "mlf_path",
-            "version_spec": f"{tvm.__version__}",
+    metadata["modules"] = {}
+    for mod in mods:
+        is_aot = isinstance(mod, executor_factory.AOTExecutorFactoryModule)
+        executor = ["aot"] if is_aot else ["graph"]
+        module_name = mod.libmod_name
+        metadata["modules"][module_name] = {
+            "model_name": module_name,
+            "export_datetime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%SZ"),
+            "memory": _build_memory_map(mod),
+            "target": [str(t) for t in mod.target],
+            "executors": executor,
+            "style": "full-model",
         }
-        external_dependencies = [standalone_crt]
-        metadata["external_dependencies"] = external_dependencies
 
-    with open(tempdir / "metadata.json", "w") as json_f:
+        if is_aot and (str(mod.runtime) == "crt"):
+            standalone_crt = {
+                "short_name": "tvm_standalone_crt",
+                "url": f"{STANDALONE_CRT_URL}",
+                "url_type": "mlf_path",
+                "version_spec": f"{tvm.__version__}",
+            }
+            external_dependencies = [standalone_crt]
+            metadata["modules"][module_name]["external_dependencies"] = external_dependencies
+
+    with open(tempdir / METADATA_FILE, "w") as json_f:
         json.dump(metadata, json_f, indent=2, sort_keys=True)
 
     codegen_dir = tempdir / "codegen"
     codegen_dir.mkdir()
-    _populate_codegen_dir(mod.lib, codegen_dir, mod.libmod_name)
-
-    if _should_generate_interface_header(mod):
-        include_path = codegen_dir / "host" / "include"
-        include_path.mkdir()
-        inputs, outputs = _get_inputs_and_outputs_from_module(mod)
-        devices = mod.get_devices()
-        pools = _get_pools_from_module(mod)
-        io_pool_allocations = _get_io_pool_allocation_from_module(mod)
-        workspace_size = int(metadata["memory"]["functions"]["main"][0]["workspace_size_bytes"])
-        generate_c_interface_header(
-            mod.libmod_name,
-            inputs,
-            outputs,
-            pools,
-            io_pool_allocations,
-            devices,
-            workspace_size,
-            include_path,
-        )
+    _populate_codegen_dir(mods, codegen_dir)
 
     parameters_dir = tempdir / "parameters"
     parameters_dir.mkdir()
-    param_filename = parameters_dir / f"{mod.libmod_name}.params"
-    with open(param_filename, "wb") as f:
-        f.write(param_dict.save_param_dict(mod.params))
-
     src_dir = tempdir / "src"
     src_dir.mkdir()
-    with open(src_dir / "relay.txt", "w") as f:
-        f.write(str(mod.ir_mod))
+    graph_config_dir = tempdir / "executor-config" / "graph"
+    for mod in mods:
+        if _should_generate_interface_header(mod):
+            include_path = codegen_dir / "host" / "include"
+            if not include_path.exists():
+                include_path.mkdir()
+
+            inputs, outputs = _get_inputs_and_outputs_from_module(mod)
+            devices = mod.get_devices()
+            pools = _get_pools_from_module(mod)
+            io_pool_allocations = _get_io_pool_allocation_from_module(mod)
+            workspace_size = int(
+                metadata["modules"][mod.libmod_name]["memory"]["functions"]["main"][0][
+                    "workspace_size_bytes"
+                ]
+            )
+            generate_c_interface_header(
+                mod.libmod_name,
+                inputs,
+                outputs,
+                pools,
+                io_pool_allocations,
+                devices,
+                workspace_size,
+                include_path,
+            )
+
+        is_aot = isinstance(mod, executor_factory.AOTExecutorFactoryModule)
+        param_filename = parameters_dir / f"{mod.libmod_name}.params"
+        with open(param_filename, "wb") as f:
+            f.write(param_dict.save_param_dict(mod.params))
 
-    if not is_aot:
-        graph_config_dir = tempdir / "executor-config" / "graph"
-        graph_config_dir.mkdir(parents=True)
-        with open(graph_config_dir / "graph.json", "w") as f:
-            f.write(mod.get_executor_config())
+        with open(src_dir / f"{mod.libmod_name}.relay", "w") as f:
+            f.write(str(mod.ir_mod))
+
+        if not is_aot:
+            if not graph_config_dir.exists():
+                graph_config_dir.mkdir(parents=True)
+            with open(graph_config_dir / f"{mod.libmod_name}.graph", "w") as f:
+                f.write(mod.get_executor_config())
 
 
 class NonStaticShapeError(Exception):
@@ -451,14 +510,11 @@ def _eval_shape(param_name, buffer_shape):
 
 def _export_operator_model_library_format(mod: build_module.OperatorModule, tempdir):
     """Export the result of tvm.build() in Model Library Format.
 
     Parameters
     ----------
     mod : runtime.Module
         The Module returned from tvm.build().
-    args : list of Buffer or Tensor or Var, optional
-        The args supplied to tvm.build().
-    file_name : str
-        Path to the .tar archive to generate.
+    tempdir : str
+        Temporary directory to populate with Model Library Format contents.
     """
     targets = []
@@ -484,12 +540,12 @@ def _export_operator_model_library_format(mod: build_module.OperatorModule, tempdir):
         "executors": [],
         "style": "operator",
     }
-    with open(tempdir / "metadata.json", "w") as metadata_f:
+    with open(tempdir / METADATA_FILE, "w") as metadata_f:
         json.dump(metadata, metadata_f)
 
     codegen_dir = tempdir / "codegen"
     codegen_dir.mkdir()
-    _populate_codegen_dir(mod, codegen_dir)
+    _populate_codegen_dir(list([mod]), codegen_dir)
 
 
 ExportableModule = typing.Union[
@@ -499,7 +555,10 @@ def _export_operator_model_library_format(mod: build_module.OperatorModule, tempdir):
 ]
 
 
-def export_model_library_format(mod: ExportableModule, file_name: typing.Union[str, pathlib.Path]):
+def export_model_library_format(
+    mods: typing.Union[ExportableModule, typing.List[ExportableModule]],
+    file_name: typing.Union[str, pathlib.Path],
+):
     """Export the build artifact in Model Library Format.
 
     This function creates a .tar archive containing the build artifacts in a standardized
@@ -508,7 +567,7 @@ def export_model_library_format(mods, file_name):
 
     Parameters
     ----------
-    mod : ExportableModule
+    mods : ExportableModule or List[ExportableModule]
         The return value of tvm.build or tvm.relay.build.
     file_name : str
         Path to the .tar archive to generate.
@@ -518,20 +577,36 @@ def export_model_library_format(mods, file_name):
     file_name : str
         The path to the generated .tar archive.
""" - file_name = pathlib.Path(file_name) + modules = mods + if not isinstance(mods, list): + modules = list([mods]) + + operator_module_type = all(isinstance(mod, build_module.OperatorModule) for mod in modules) + graph_module_type = all( + isinstance( + mod, + ( + executor_factory.AOTExecutorFactoryModule, + executor_factory.GraphExecutorFactoryModule, + ), + ) + for mod in modules + ) + file_name = pathlib.Path(file_name) tempdir = utils.tempdir() - if isinstance(mod, build_module.OperatorModule): - _export_operator_model_library_format(mod, tempdir.path) - elif isinstance( - mod, - (executor_factory.AOTExecutorFactoryModule, executor_factory.GraphExecutorFactoryModule), - ): - _export_graph_model_library_format(mod, tempdir.path) + if operator_module_type: + if len(modules) != 1: + raise RuntimeError("Multiple operator is not supported.") + _export_operator_model_library_format(modules[0], tempdir.path) + elif graph_module_type: + _export_graph_model_library_format(modules, tempdir.path) else: - raise NotImplementedError(f"Don't know how to export module of type {mod.__class__!r}") + raise NotImplementedError( + f"Don't know how to export module of type {modules[0].__class__!r}" + ) - _make_tar(tempdir.path, file_name, mod) + _make_tar(tempdir.path, file_name, modules) return file_name diff --git a/python/tvm/micro/testing/utils.py b/python/tvm/micro/testing/utils.py index 81e29a92a86a1..a48c8dc3230fb 100644 --- a/python/tvm/micro/testing/utils.py +++ b/python/tvm/micro/testing/utils.py @@ -24,6 +24,8 @@ import time from typing import Union +import tvm +from tvm import relay from tvm.micro.project_api.server import IoTimeoutError # Timeout in seconds for AOT transport. @@ -77,9 +79,36 @@ def _read_line(transport, timeout_sec: int) -> str: def mlf_extract_workspace_size_bytes(mlf_tar_path: Union[pathlib.Path, str]) -> int: """Extract an MLF archive file and read workspace size from metadata file.""" + workspace_size = 0 with tarfile.open(mlf_tar_path, "r:*") as tar_file: tar_members = [ti.name for ti in tar_file.getmembers()] assert "./metadata.json" in tar_members with tar_file.extractfile("./metadata.json") as f: metadata = json.load(f) - return metadata["memory"]["functions"]["main"][0]["workspace_size_bytes"] + for mod_name in metadata["modules"].keys(): + workspace_size += metadata["modules"][mod_name]["memory"]["functions"]["main"][0][ + "workspace_size_bytes" + ] + return workspace_size + + +def get_conv2d_relay_module(): + """Generate a conv2d Relay module for testing.""" + data_shape = (1, 3, 64, 64) + weight_shape = (8, 3, 5, 5) + data = relay.var("data", relay.TensorType(data_shape, "int8")) + weight = relay.var("weight", relay.TensorType(weight_shape, "int8")) + y = relay.nn.conv2d( + data, + weight, + padding=(2, 2), + channels=8, + kernel_size=(5, 5), + data_layout="NCHW", + kernel_layout="OIHW", + out_dtype="int32", + ) + f = relay.Function([data, weight], y) + mod = tvm.IRModule.from_expr(f) + mod = relay.transform.InferType()(mod) + return mod diff --git a/python/tvm/relay/op/contrib/cmsisnn.py b/python/tvm/relay/op/contrib/cmsisnn.py index 09831929e5277..8d714b7269d9c 100644 --- a/python/tvm/relay/op/contrib/cmsisnn.py +++ b/python/tvm/relay/op/contrib/cmsisnn.py @@ -223,7 +223,6 @@ def qnn_max_pool2d_pattern(): def check_qnn_max_pool2d(pattern): """Check if max pool2d is supported by CMSIS-NN.""" output = pattern - input_op = None if str(pattern.op.name) == "clip": pooling = pattern.args[0] diff --git a/python/tvm/relay/op/contrib/dnnl.py 
diff --git a/python/tvm/relay/op/contrib/cmsisnn.py b/python/tvm/relay/op/contrib/cmsisnn.py
index 09831929e5277..8d714b7269d9c 100644
--- a/python/tvm/relay/op/contrib/cmsisnn.py
+++ b/python/tvm/relay/op/contrib/cmsisnn.py
@@ -223,7 +223,6 @@ def qnn_max_pool2d_pattern():
     def check_qnn_max_pool2d(pattern):
         """Check if max pool2d is supported by CMSIS-NN."""
         output = pattern
-        input_op = None
 
         if str(pattern.op.name) == "clip":
             pooling = pattern.args[0]
diff --git a/python/tvm/relay/op/contrib/dnnl.py b/python/tvm/relay/op/contrib/dnnl.py
index 6581f10a2f568..c251b66bfbc77 100644
--- a/python/tvm/relay/op/contrib/dnnl.py
+++ b/python/tvm/relay/op/contrib/dnnl.py
@@ -39,13 +39,14 @@
 from tvm.relay import transform
 from tvm.relay.expr import GlobalVar
 from tvm.relay.expr_functor import ExprMutator, ExprVisitor
+from tvm.relay.expr import const
 from tvm.relay.analysis import analysis as _analysis
 from tvm.relay import expr as _expr
 
 from ... import _ffi_api
-from ...dataflow_pattern import wildcard, is_op, is_expr, rewrite, DFPatternCallback
+from ...dataflow_pattern import wildcard, is_op, is_constant, is_expr, rewrite, DFPatternCallback
 from .register import register_pattern_table
 
@@ -56,8 +57,8 @@ def _register_external_op_helper(op_name, supported=True):
     """The helper function to indicate that a given operator can be supported
     by DNNL.
 
-    Paramters
-    ---------
+    Parameters
+    ----------
     op_name : Str
         The name of operator that will be registered.
 
@@ -69,6 +70,10 @@ def _register_external_op_helper(op_name, supported=True):
 
     @tvm.ir.register_op_attr(op_name, "target.dnnl")
     def _func_wrapper(expr):
+        args = expr.args
+        if any([x.checked_type.dtype == "int64" for x in args]):
+            logger.info("DNNL does not support int64.")
+            return False
         return supported
 
     return _func_wrapper
@@ -90,6 +95,7 @@ def _func_wrapper(expr):
 _register_external_op_helper("exp")
 _register_external_op_helper("log")
 _register_external_op_helper("sqrt")
+_register_external_op_helper("round")
 _register_external_op_helper("nn.relu")
 _register_external_op_helper("nn.leaky_relu")
 _register_external_op_helper("tanh")
@@ -199,6 +205,70 @@ def make_dnnl_pattern(op_name, with_bias, with_eltwise):
     return dnnl_pattern
 
 
+def make_qnn_conv2d_pattern():
+    """Make qnn.conv2d based pattern supported by DNNL
+
+    Returns
+    -------
+    pattern : Tuple(pattern_name, CallPattern)
+        Created pattern name, along with its CallPattern.
+    """
+    data = wildcard()
+    weight = is_constant()
+    bias = is_constant()
+    o_scl = is_constant()
+    dst_zp = is_constant()
+    act_scl = is_constant()
+    sum_scl = is_constant()
+    sum_src = wildcard()
+
+    zero_zp = is_expr(const(0, dtype="int32"))
+
+    pat = is_op("qnn.conv2d")(data, weight, zero_zp, zero_zp, is_constant(), is_constant())
+    pat = is_op("cast")(pat)
+    pat = is_op("add")(pat, bias) | pat  # optional bias
+    pat = is_op("multiply")(pat, o_scl)
+    pat = is_op("clip")(pat)  # TBD, not only clip
+    pat = is_op("multiply")(pat, act_scl) | pat  # optional multiply. Ex: act_scl == 1
+    pat = is_op("add")(pat, sum_scl * is_op("cast")(sum_src)) | pat  # optional sum
+    pat = is_op("add")(pat, dst_zp) | pat  # optional dst_zp, can be dst_zp == 0
+    pat = is_op("cast")(pat)
+
+    return "dnnl.qnn.conv2d", pat
+
+
+def make_qnn_dense_pattern():
+    """Make qnn.dense based pattern supported by DNNL
+
+    Returns
+    -------
+    pattern : Tuple(pattern_name, CallPattern)
+        Created pattern name, along with its CallPattern.
+    """
+    data = wildcard()
+    weight = is_constant()
+    bias = is_constant()
+    o_scl = is_constant()
+    dst_zp = is_constant()
+    act_scl = is_constant()
+    sum_scl = is_constant()
+    sum_src = wildcard()
+
+    zero_zp = is_expr(const(0, dtype="int32"))
+
+    pat = is_op("qnn.dense")(data, weight, zero_zp, zero_zp, is_constant(), is_constant())
+    pat = is_op("cast")(pat)
+    pat = is_op("add")(pat, bias) | pat  # optional bias
+    pat = is_op("multiply")(pat, o_scl)
+    pat = is_op("clip")(pat)  # TBD, not only clip
+    pat = is_op("multiply")(pat, act_scl) | pat  # optional multiply. Ex: act_scl == 1
+    pat = is_op("add")(pat, sum_scl * is_op("cast")(sum_src)) | pat  # optional sum
+    pat = is_op("add")(pat, dst_zp) | pat  # optional dst_zp, can be dst_zp == 0
+    pat = is_op("cast")(pat)
+
+    return "dnnl.qnn.dense", pat
+
+
 @register_pattern_table("dnnl")
 def pattern_table():
     """Create dnnl patterns.
@@ -208,8 +278,11 @@ def pattern_table():
     dnnl_patterns : List[dnnl_pattern]
         Created patterns.
     """
+    dnnl_patterns = list()
+    dnnl_patterns.append(make_qnn_conv2d_pattern())
+    dnnl_patterns.append(make_qnn_dense_pattern())
+
     elt_list = ["nn.relu", "tanh", "sigmoid", "gelu", None]
-    dnnl_patterns = []
     for with_bias in [True, False]:
         for elt in elt_list:
             if not with_bias and not elt:
@@ -707,3 +780,201 @@ def rewrite_dense_bias_gelu_reshape_last(mod):
         [DenseReshapeBiasGeluRewrite(), DenseReshapeBiasGeluRewrite(has_gelu=False)], mod["main"]
     )
     return mod
+
+
+class LegalizeQnnOpForDnnl(DFPatternCallback):
+    """Legalize QNN based patterns to match DNNL
+
+    original pattern:
+      OP = qnn.dense | qnn.conv2d
+      %1 = OP(SRC, WGH) - OP(src_zp, WGH)                                // qnn.conv2d
+      %2 = %1 + orig_bias                                                // bias
+      %2 = (%1 - rq_in_zp) * rq_in_scl / rq_out_scl + rq_out_zp          // qnn.requantize
+      %3 = act(%2)                                                       // activation == clip
+      %4 = ((%3 - sum_lh_zp) * sum_lh_scl + (SRC2 - sum_rh_zp) * sum_rh_scl)  // qnn.add
+           / sum_out_scl + sum_out_zp
+
+    transform to DNNL compatible:
+      %1 = OP(SRC, WGH)
+      %2 = cast(%1, dtype="float")
+      %2 = (%1 + bias) * o_scl
+      %3 = act(%2) * act_scl
+      %4 = %3 + SRC2 * sum_scl
+      %5 = %4 + dst_zp
+      %6 = cast(%5, dtype="float")
+
+    where:
+      o_scl = rq_in_scl / rq_out_scl
+      act_scl = sum_lhs_scl / sum_out_scl
+      sum_scl = sum_rhs_scl / sum_out_scl
+      bias = orig_bias - OP(src_zp, WGH) - rq_in_zp + rq_out_zp * rq_out_scl / rq_in_scl
+      dst_zp = sum_out_zp - sum_lhs_zp * sum_lhs_scl / sum_out_scl -
+               sum_rhs_zp * sum_rhs_scl / sum_out_scl
+    """
+
+    def __init__(self):
+        super(LegalizeQnnOpForDnnl, self).__init__()
+        self.src = wildcard()
+        self.wgh = wildcard()
+        self.bias = wildcard()
+        self.sum_src = wildcard()
+
+        self.src_scl = is_constant()
+        self.src_zp = is_constant()
+        self.wgh_scl = is_constant()
+        self.wgh_zp = is_expr(const(0))
+
+        self.rq_in_scl = is_constant()
+        self.rq_in_zp = is_constant()
+        self.rq_out_scl = is_constant()
+        self.rq_out_zp = is_constant()
+
+        self.sum_lhs_scl = is_constant()
+        self.sum_lhs_zp = is_constant()
+        self.sum_rhs_scl = is_constant()
+        self.sum_rhs_zp = is_constant()
+        self.sum_out_scl = is_constant()
+        self.sum_out_zp = is_constant()
+
+        self.root = (is_op("qnn.conv2d") | is_op("qnn.dense"))(
+            self.src, self.wgh, self.src_zp, self.wgh_zp, self.src_scl, self.wgh_scl
+        )
+        pat = is_op("add")(self.root, self.bias) | self.root  # optional bias
+        pat = is_op("qnn.requantize")(
+            pat, self.rq_in_scl, self.rq_in_zp, self.rq_out_scl, self.rq_out_zp
+        )
+        pat = is_op("clip")(pat)
+        cast = is_op("cast")(pat)
+        pat = is_op("qnn.add")(
+            cast,
+            self.sum_src,
+            self.sum_lhs_scl,
+            self.sum_lhs_zp,
+            self.sum_rhs_scl,
+            self.sum_rhs_zp,
+            self.sum_out_scl,
+            self.sum_out_zp,
+        )
+        pat = is_op("clip")(pat)
+        self.pattern = pat | cast
+
+    def callback(self, pre, post, node_map):
+        root = node_map[self.root][0]
+        src = node_map[self.src][0]
+        wgh = node_map[self.wgh][0]
+        bias = node_map.get(self.bias, default=[relay.const(0, dtype="int32")])[0]
+        src_zp = node_map[self.src_zp][0]
+        rq_in_scl = node_map[self.rq_in_scl][0]
+        rq_in_zp = node_map[self.rq_in_zp][0]
+        rq_out_scl = node_map[self.rq_out_scl][0]
+        rq_out_zp = node_map[self.rq_out_zp][0]
+
+        final_dtype =
 node_map[self.pattern][0].checked_type.dtype
+
+        if root.op == relay.op.get("qnn.conv2d"):
+            dst_layout = root.attrs.out_layout
+            dst_layout = root.attrs.data_layout if dst_layout == "" else dst_layout
+            wgh_layout = root.attrs.kernel_layout
+        else:
+            # qnn.dense has no layout attributes. Assume that it is plain
+            dst_layout = "NC"
+            wgh_layout = "OI"
+
+        # TODO(@apeskov): dst_layout may be blocked
+        bias_rank = len(dst_layout) - dst_layout.index("C")
+
+        sum_src = node_map[self.sum_src][0] if self.sum_src in node_map else None
+        # Default values if qnn.sum is not present
+        sum_lhs_scl = node_map[self.sum_lhs_scl][0] if sum_src else relay.const(1, dtype="float32")
+        sum_lhs_zp = node_map[self.sum_lhs_zp][0] if sum_src else relay.const(0, dtype="int32")
+        sum_rhs_scl = node_map[self.sum_rhs_scl][0] if sum_src else relay.const(0, dtype="float32")
+        sum_rhs_zp = node_map[self.sum_rhs_zp][0] if sum_src else relay.const(0, dtype="int32")
+        sum_out_scl = node_map[self.sum_out_scl][0] if sum_src else relay.const(1, dtype="float32")
+        sum_out_zp = node_map[self.sum_out_zp][0] if sum_src else relay.const(0, dtype="int32")
+
+        def cast_fp(op):
+            return relay.op.cast(op, dtype="float32")
+
+        # recalculate some factors
+        o_scl = rq_in_scl / rq_out_scl
+        act_scl = sum_lhs_scl / sum_out_scl
+        sum_scl = sum_rhs_scl / sum_out_scl
+        dst_zp = (
+            cast_fp(sum_out_zp)
+            - cast_fp(sum_lhs_zp) * sum_lhs_scl / sum_out_scl
+            - cast_fp(sum_rhs_zp) * sum_rhs_scl / sum_out_scl
+        )
+        bias = self.squeeze_bias(bias, dst_layout)
+        bias = (
+            cast_fp(bias)
+            - cast_fp(self.fake_op(src_zp, wgh, wgh_layout))
+            - cast_fp(rq_in_zp)
+            + cast_fp(rq_out_zp) * rq_out_scl / rq_in_scl
+        )
+        bias = self.broadcast_to_rank(bias, bias_rank)
+
+        zero_zp = relay.const(0, dtype="int32")
+        one_scl = relay.const(1.0, dtype="float32")
+
+        # construct new graph with proper post op ordering
+        gr = tvm.relay.Call(
+            root.op,
+            [src, wgh, zero_zp, zero_zp, one_scl, one_scl],
+            root.attrs,
+            root.type_args,
+            root.span,
+        )
+        gr = relay.op.cast(gr, dtype="float32")
+        gr = gr + bias
+        gr = gr * o_scl
+        gr = relay.op.clip(gr, 0, 255) * act_scl
+        gr = gr + sum_scl * cast_fp(sum_src) if sum_src else gr
+        gr = gr + dst_zp
+        gr = relay.op.cast(gr, dtype=final_dtype)
+        return gr
+
+    @staticmethod
+    def fake_op(zp, wgh, layout):
+        """Fake operator implementation for zp broadcast input"""
+        # Conv: reduce kernel {OC, IC, KH, KW} -> {OC} in case of group that is still correct
+        # Dense: reduce kernel {OC, IC} -> {OC}
+        wgh_int = relay.op.cast(wgh, dtype="int32")
+        reduced_kernel = relay.op.sum(
+            wgh_int, axis=[layout.index("O")], keepdims=False, exclude=True
+        )
+        return zp * reduced_kernel
+
+    @staticmethod
+    def squeeze_bias(bias, layout):
+        shape = transform.InferTypeLocal(bias).concrete_shape
+        c_position = layout.index("C") - len(layout) + len(shape)
+        squeeze_idxs = [i for i in range(len(shape)) if i != c_position]
+        return relay.op.squeeze(bias, squeeze_idxs)
+
+    @staticmethod
+    def broadcast_to_rank(op, rank):
+        """Scalar or 1D tensor are supported"""
+        shape = transform.InferTypeLocal(op).concrete_shape
+        if len(shape) == 0:
+            return op
+        if len(shape) == 1:
+            return relay.op.expand_dims(op, 1, rank - 1)
+        raise ValueError("Unexpected bias rank to broadcast. Only 0 and 1 are supported.")
+
+
+def legalize_qnn_for_dnnl(mod):
+    """Transform qnn primitives to DNNL compatible form. Eliminate source zero point and apply
+    strict sequence of post ops."""
+    mod["main"] = rewrite(LegalizeQnnOpForDnnl(), mod["main"])
+
+    seq = tvm.transform.Sequential(
+        [
+            transform.InferType(),
+            # transform.SimplifyInference(),  # TODO: this pass decomposes nn.layer_norm
+            # transform.FoldScaleAxis(),  # TODO: fails inside TVM in case of grouped convolutions
+            transform.FoldConstant(),
+        ]
+    )
+    with tvm.transform.PassContext(opt_level=3):
+        mod = seq(mod)
+    return mod
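A hedged sketch of the intended call order: run the legalization before partitioning so the rewritten chains match the new `dnnl.qnn.*` patterns. `partition_for_dnnl` is this module's existing partitioning entry point, and `mod` is assumed to be a quantized Relay module.

```python
from tvm.relay.op.contrib import dnnl

mod = dnnl.legalize_qnn_for_dnnl(mod)  # rewrite qnn.conv2d / qnn.dense chains as above
mod = dnnl.partition_for_dnnl(mod)     # offload the matched "dnnl.qnn.*" patterns
```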
Eliminate the source zero point and apply + a strict sequence of post ops.""" + mod["main"] = rewrite(LegalizeQnnOpForDnnl(), mod["main"]) + + seq = tvm.transform.Sequential( + [ + transform.InferType(), + # transform.SimplifyInference(), # TODO: this pass decomposes nn.layer_norm + # transform.FoldScaleAxis(), # TODO: fails inside TVM in case of grouped convolutions. + transform.FoldConstant(), + ] + ) + with tvm.transform.PassContext(opt_level=3): + mod = seq(mod) + return mod diff --git a/python/tvm/runtime/ndarray.py b/python/tvm/runtime/ndarray.py index 9d3a3aff21659..16790ca2c783c 100644 --- a/python/tvm/runtime/ndarray.py +++ b/python/tvm/runtime/ndarray.py @@ -236,18 +236,21 @@ def numpy(self): return np_arr_ret.reshape(shape) return np_arr - def copyto(self, target): + def copyto(self, target, mem_scope=None): """Copy array to target Parameters ---------- target : NDArray The target array to be copied, must have same shape as this array. + + mem_scope : Optional[str] + The memory scope of the array. """ if isinstance(target, NDArrayBase): return self._copyto(target) if isinstance(target, Device): - res = empty(self.shape, self.dtype, target) + res = empty(self.shape, self.dtype, target, mem_scope) return self._copyto(res) raise ValueError("Unsupported target type %s" % str(type(target))) @@ -574,7 +577,7 @@ def webgpu(dev_id=0): mtl = metal -def array(arr, device=cpu(0)): +def array(arr, device=cpu(0), mem_scope=None): """Create an array from source arr. Parameters @@ -585,6 +588,9 @@ def array(arr, device=cpu(0)): device : Device, optional The device device to create the array + mem_scope : Optional[str] + The memory scope of the array + Returns ------- ret : NDArray @@ -595,7 +601,7 @@ def array(arr, device=cpu(0)): if not isinstance(arr, (np.ndarray, NDArray)): arr = np.array(arr) - return empty(arr.shape, arr.dtype, device).copyfrom(arr) + return empty(arr.shape, arr.dtype, device, mem_scope).copyfrom(arr) # Register back to FFI diff --git a/src/meta_schedule/database/database.cc b/src/meta_schedule/database/database.cc index 9905ff73c792c..5adff49984946 100644 --- a/src/meta_schedule/database/database.cc +++ b/src/meta_schedule/database/database.cc @@ -85,6 +85,19 @@ TuningRecord::TuningRecord(tir::Trace trace, Workload workload, Optionaldata_ = n; } +MeasureCandidate TuningRecordNode::AsMeasureCandidate() const { + tir::Schedule sch = + tir::Schedule::Traced(workload->mod, -1, 0, tir::ScheduleErrorRenderLevel::kDetail); + trace->ApplyToSchedule(sch, false, nullptr); + tir::PrimFunc func; + for (const auto& kv : sch->mod()->functions) { + func = Downcast(kv.second); + } + Array args_info = ArgInfo::FromPrimFunc(func); + MeasureCandidate candidate = MeasureCandidate(sch, args_info); + return candidate; +} + ObjectRef TuningRecordNode::AsJSON() const { Optional> json_args_info{nullptr}; Optional json_target{nullptr}; @@ -152,12 +165,15 @@ TuningRecord TuningRecord::FromJSON(const ObjectRef& json_obj, const Workload& w Database Database::PyDatabase(PyDatabaseNode::FHasWorkload f_has_workload, PyDatabaseNode::FCommitWorkload f_commit_workload, PyDatabaseNode::FCommitTuningRecord f_commit_tuning_record, - PyDatabaseNode::FGetTopK f_get_top_k, PyDatabaseNode::FSize f_size) { + PyDatabaseNode::FGetTopK f_get_top_k, + PyDatabaseNode::FGetAllTuningRecords f_get_all_tuning_records, + PyDatabaseNode::FSize f_size) { ObjectPtr n = make_object(); n->f_has_workload = f_has_workload; n->f_commit_workload = f_commit_workload; n->f_commit_tuning_record = f_commit_tuning_record; n->f_get_top_k =
f_get_top_k; + n->f_get_all_tuning_records = f_get_all_tuning_records; n->f_size = f_size; return Database(n); } @@ -179,6 +195,8 @@ TVM_REGISTER_GLOBAL("meta_schedule.TuningRecord") Optional target, Optional> args_info) { return TuningRecord(trace, workload, run_secs, target, args_info); }); +TVM_REGISTER_GLOBAL("meta_schedule.TuningRecordAsMeasureCandidate") + .set_body_method(&TuningRecordNode::AsMeasureCandidate); TVM_REGISTER_GLOBAL("meta_schedule.TuningRecordAsJSON") .set_body_method(&TuningRecordNode::AsJSON); TVM_REGISTER_GLOBAL("meta_schedule.TuningRecordFromJSON").set_body_typed(TuningRecord::FromJSON); @@ -190,6 +208,8 @@ TVM_REGISTER_GLOBAL("meta_schedule.DatabaseCommitTuningRecord") .set_body_method(&DatabaseNode::CommitTuningRecord); TVM_REGISTER_GLOBAL("meta_schedule.DatabaseGetTopK") .set_body_method(&DatabaseNode::GetTopK); +TVM_REGISTER_GLOBAL("meta_schedule.DatabaseGetAllTuningRecords") + .set_body_method(&DatabaseNode::GetAllTuningRecords); TVM_REGISTER_GLOBAL("meta_schedule.DatabaseSize").set_body_method(&DatabaseNode::Size); TVM_REGISTER_GLOBAL("meta_schedule.DatabasePyDatabase").set_body_typed(Database::PyDatabase); diff --git a/src/meta_schedule/database/json_database.cc b/src/meta_schedule/database/json_database.cc index 4f5bd9b136131..9bb7ee1027b99 100644 --- a/src/meta_schedule/database/json_database.cc +++ b/src/meta_schedule/database/json_database.cc @@ -156,6 +156,15 @@ class JSONDatabaseNode : public DatabaseNode { return results; } + Array GetAllTuningRecords() { + Array results; + results.reserve(Size()); + for (const TuningRecord& record : this->tuning_records_) { + results.push_back(record); + } + return results; + } + int64_t Size() { return tuning_records_.size(); } }; diff --git a/src/meta_schedule/tune_context.cc b/src/meta_schedule/tune_context.cc index 0c70dcf5c406f..57b2344c6f8db 100644 --- a/src/meta_schedule/tune_context.cc +++ b/src/meta_schedule/tune_context.cc @@ -142,7 +142,9 @@ Array TuneContextNode::_Join() { results.push_back(future->Result()); } } - this->search_strategy.value()->NotifyRunnerResults(this->measure_candidates.value(), results); + if (this->search_strategy.defined()) { + this->search_strategy.value()->NotifyRunnerResults(this->measure_candidates.value(), results); + } ICHECK(this->measure_candidates.defined()); ICHECK(this->builder_results.defined()); ICHECK_EQ(results.size(), this->measure_candidates.value().size()); @@ -177,6 +179,16 @@ TVM_REGISTER_GLOBAL("meta_schedule.TuneContext") TVM_REGISTER_GLOBAL("meta_schedule._SHash2Hex").set_body_typed(SHash2Hex); TVM_REGISTER_GLOBAL("meta_schedule.TuneContextInitialize") .set_body_method(&TuneContextNode::Initialize); +TVM_REGISTER_GLOBAL("meta_schedule.TuneContextSetMeasureCandidates") + .set_body_method(&TuneContextNode::_SetMeasureCandidates); +TVM_REGISTER_GLOBAL("meta_schedule.TuneContextSendToBuilder") + .set_body_method(&TuneContextNode::_SendToBuilder); +TVM_REGISTER_GLOBAL("meta_schedule.TuneContextSendToRunner") + .set_body_method(&TuneContextNode::_SendToRunner); +TVM_REGISTER_GLOBAL("meta_schedule.TuneContextJoin") + .set_body_method(&TuneContextNode::_Join); +TVM_REGISTER_GLOBAL("meta_schedule.TuneContextClearMeasureState") + .set_body_method(&TuneContextNode::_ClearMeasureState); } // namespace meta_schedule } // namespace tvm diff --git a/src/relay/backend/contrib/cmsisnn/extract_constants.cc b/src/relay/backend/contrib/cmsisnn/extract_constants.cc index 1cbe36e30f765..c6ed7af9ff031 100644 --- a/src/relay/backend/contrib/cmsisnn/extract_constants.cc +++ 
b/src/relay/backend/contrib/cmsisnn/extract_constants.cc @@ -164,7 +164,18 @@ class ExtractConstantsMutator : public MixedModeMutator { function_signature.push_back(arg); } else { if (arg.as()) { - function_signature.push_back(arg); + // Only push the argument if it is not already present: an input var with + // multiple consumers should appear only once in the function signature. + bool found_in_existing_signature = false; + for (auto& sign : function_signature) { + if (arg.same_as(sign)) { + found_in_existing_signature = true; + break; + } + } + if (!found_in_existing_signature) { + function_signature.push_back(arg); + } } new_args.push_back(arg); } diff --git a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc index 524735caa9d6a..5c99061fa854b 100644 --- a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc +++ b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc @@ -556,7 +556,12 @@ class RelayToTIRVisitor : public MixedModeMutator { BufferCreator buffer_creator; tir::Var input_0 = buffer_creator.CreateBufferVar("input_0", DataType::Handle(8)); - tir::Var input_1 = buffer_creator.CreateBufferVar("input_1", DataType::Handle(8)); + tir::Var input_1; + if (mul_call->args[0].same_as(mul_call->args[1])) { + input_1 = input_0; + } else { + input_1 = buffer_creator.CreateBufferVar("input_1", DataType::Handle(8)); + } tir::Var output = buffer_creator.CreateBufferVar("output", DataType::Handle(8)); tvm::Array args = { @@ -626,7 +631,12 @@ class RelayToTIRVisitor : public MixedModeMutator { BufferCreator buffer_creator; tir::Var input_0 = buffer_creator.CreateBufferVar("input_0", DataType::Handle(8)); - tir::Var input_1 = buffer_creator.CreateBufferVar("input_1", DataType::Handle(8)); + tir::Var input_1; + if (add_call->args[0].same_as(add_call->args[1])) { + input_1 = input_0; + } else { + input_1 = buffer_creator.CreateBufferVar("input_1", DataType::Handle(8)); + } tir::Var output = buffer_creator.CreateBufferVar("output", DataType::Handle(8)); tvm::Array args = { diff --git a/src/relay/backend/contrib/cmsisnn/scalar_to_tensor_constant.cc b/src/relay/backend/contrib/cmsisnn/scalar_to_tensor_constant.cc index 2448bfc766306..40fd773eb2092 100644 --- a/src/relay/backend/contrib/cmsisnn/scalar_to_tensor_constant.cc +++ b/src/relay/backend/contrib/cmsisnn/scalar_to_tensor_constant.cc @@ -179,6 +179,12 @@ class ScalarToTensorConstantMutator : public MixedModeMutator { auto new_body = VisitExpr(func->body); Function new_func = WithFields(func, FreeVars(new_body), new_body, func->ret_type, FreeTypeVars(new_body, mod_), func->attrs); + + // Updating new_func parameters may deduplicate the function parameters. + // Call arguments need to be aligned to the number of arguments expected by new_func. + if (new_args[0].same_as(new_args[1])) { + new_args.erase(new_args.begin()); + } return Call(new_func, new_args); } diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index 927cd12ae0fb9..f17cdafa76a5f 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -35,6 +35,7 @@ #include #include "../../utils.h" +#include "comp_op_matcher.h" #ifdef USE_JSON_RUNTIME #include "../../../../runtime/contrib/json/json_node.h" @@ -436,6 +437,30 @@ class DNNLModuleCodegen : public CSourceModuleCodegenBase { #else // DNNL JSON runtime +/*!
+ * \brief Replace var exprs which are bound to args of the call node + * + * \param args vector of expressions (vars or constant nodes) + * \param cn call node which describes the mapping of internal body vars to args + * \return updated vector of expressions + */ +static tvm::Array BindToCallNodeArgs(const std::vector& args, const CallNode* cn) { + tvm::Array res; + for (const auto& arg : args) { + if (arg->IsInstance()) { + res.push_back(arg); + } else { + auto body_params = cn->op.as()->params; + auto found = std::find(body_params.begin(), body_params.end(), arg); + ICHECK(found != body_params.end()); + auto idx = std::distance(body_params.begin(), found); + res.push_back(cn->args[idx]); + } + } + return res; +} + +/*! \brief Serializer to DNNL JSON runtime module */ class DNNLJSONSerializer : public backend::contrib::JSONSerializer { using JSONGraphNode = tvm::runtime::json::JSONGraphNode; using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry; @@ -475,14 +500,19 @@ class DNNLJSONSerializer : public backend::contrib::JSONSerializer { } public: - DNNLJSONSerializer(const std::string& symbol, const Expr& expr) : JSONSerializer(symbol, expr) {} + DNNLJSONSerializer(const std::string& symbol, const Expr& expr) + : JSONSerializer("dnnl_" + symbol, expr) {} std::vector VisitExpr_(const CallNode* cn) override { Expr expr = GetRef(cn); std::string name; + tvm::Array args; + std::unordered_map extra_attrs; + const CallNode* call = cn; if (const auto* op_node = cn->op.as()) { name = op_node->name; + args = cn->args; } else if (const auto* fn = cn->op.as()) { auto comp = fn->GetAttr(attr::kComposite); ICHECK(comp.defined()) << "DNNL JSON runtime only supports composite functions."; @@ -511,15 +541,24 @@ class DNNLJSONSerializer : public backend::contrib::JSONSerializer { } else if (name.find("dnnl.dense") != std::string::npos) { call = GetRootCall(fn->body.as(), 10, "nn.dense"); ICHECK(call->op.as()) << "Not op node"; + } else if (name.find("dnnl.qnn.conv2d") != std::string::npos || + name.find("dnnl.qnn.dense") != std::string::npos) { + std::vector args_loc; + call = ParseComposite(*fn, &extra_attrs, &args_loc); + args = BindToCallNodeArgs(args_loc, cn); } else { LOG(FATAL) << "Unrecognized DNNL pattern: " << name; } + + if (args.empty()) { + args = cn->args; + } } else { LOG(FATAL) << "DNNL JSON runtime does not support calls to " << cn->op->GetTypeKey(); } std::vector inputs; - for (const auto& arg : cn->args) { + for (const auto& arg : args) { auto res = VisitExpr(arg); inputs.insert(inputs.end(), res.begin(), res.end()); } @@ -527,6 +566,8 @@ class DNNLJSONSerializer : public backend::contrib::JSONSerializer { "kernel", /* op_type_ */ inputs, 1 /* num_outputs_ */); SetCallNodeAttribute(node, call); + for (const auto& kvp : extra_attrs) node->SetAttr(kvp.first, kvp.second); + return AddNode(node, GetRef(cn)); } }; @@ -558,6 +599,61 @@ runtime::Module DNNLCompiler(const ObjectRef& ref) { TVM_REGISTER_GLOBAL("relay.ext.dnnl").set_body_typed(DNNLCompiler); +/*! + * \brief Constant Updater for DNNL JSON runtime + * + * Not every originally existing ConstantNode should be passed to the JSON runtime. + * Some of them may be skipped, or their ordering may change. So we have to apply the same + * graph traversal as DNNLJSONSerializer.
+ */ +struct DNNLConstantUpdater : public ConstantUpdater { + public: + DNNLConstantUpdater(const std::string& symbol, + std::unordered_map* params) + : ConstantUpdater("dnnl_" + symbol, params) {} + using ConstantUpdater::VisitExpr_; + + void VisitExpr_(const CallNode* cn) final { + this->VisitSpan(cn->span); + + if (const auto* fn = cn->op.as()) { + std::vector args_loc; + std::unordered_map attrs; + auto root_cn = ParseComposite(*fn, &attrs, &args_loc); + + auto args = root_cn ? BindToCallNodeArgs(args_loc, cn) : cn->args; + + // Customized visit order of args + for (const auto& arg : args) { + this->VisitExpr(arg); + } + } else { + // Original visit order of args + for (auto arg : cn->args) { + this->VisitExpr(arg); + } + } + } +}; + +/*! + * \brief The external compiler/codegen tool. It takes a Relay expression/module and + * produces a collection of the required constant NDArrays. + */ +Map DNNLConstantUpdaterFunc(Expr expr, std::string symbol) { + // Visit all suitable constant nodes + std::unordered_map res; + DNNLConstantUpdater const_updater(symbol, &res); + const_updater(expr); + + // Convert to tvm::Map + Map ret; + for (const auto& kvp : res) ret.Set(kvp.first, kvp.second); + return ret; +} + +TVM_REGISTER_GLOBAL("relay.ext.dnnl.constant_updater").set_body_typed(DNNLConstantUpdaterFunc); + } // namespace contrib } // namespace relay } // namespace tvm diff --git a/src/relay/backend/contrib/dnnl/comp_op_matcher.h b/src/relay/backend/contrib/dnnl/comp_op_matcher.h new file mode 100644 index 0000000000000..364cc6e377ca8 --- /dev/null +++ b/src/relay/backend/contrib/dnnl/comp_op_matcher.h @@ -0,0 +1,245 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/backend/contrib/dnnl/comp_op_matcher.h + * \brief Implement matcher-based functions to parse complex composite nodes. + */ + +#ifndef TVM_RELAY_BACKEND_CONTRIB_DNNL_COMP_OP_MATCHER_H_ +#define TVM_RELAY_BACKEND_CONTRIB_DNNL_COMP_OP_MATCHER_H_ + +#include + +#include +#include +#include + +#include "../../../ir/dataflow_matcher_impl.h" + +/*! + * \brief Convert a value to a dmlc-attr-acceptable format + * + * \tparam T type of value (auto deduction) + * \param val value to convert + * \return resulting dmlc object + */ +template ::value, bool> = true> +dmlc::any dmlc_attr(const T& val) { + std::vector attr; + attr.emplace_back(std::vector{std::to_string(val)}); + return dmlc::any{attr}; +} + +template ::value, bool> = true> +dmlc::any dmlc_attr(const T& val) { + std::vector attr; + attr.emplace_back(std::vector{val}); + return dmlc::any{attr}; +} + +template >::value, bool> = true> +dmlc::any dmlc_attr(const T& val) { + std::vector attr; + attr.emplace_back(val); + return dmlc::any{attr}; +} + +/*!
\brief Constructor of const scalar expression with defined type */ +tvm::relay::Expr constant(float val) { + auto value = tvm::runtime::NDArray::Empty({}, tvm::DataType::Float(32), {kDLCPU, 0}); + value.CopyFromBytes(&val, sizeof(val)); + auto res = tvm::relay::Constant(value); + tvm::relay::transform::InferTypeLocal(res); + return res; +} + +/*! + * \brief Simple helper to accumulate composite function arguments and corresponding attributes + * with indexes of them. + */ +class ArgPacker { + public: + ArgPacker(std::unordered_map* attrs, std::vector* args) + : attrs_(attrs), args_(args) {} + + int Put(const tvm::relay::Expr& arg, std::string tag_name = "") { + if (!arg.defined()) return -1; + int idx = args_->size(); + args_->push_back(arg); + if (!tag_name.empty()) { + attrs_->operator[](tag_name) = dmlc_attr(idx); + } + return idx; + } + + private: + std::unordered_map* attrs_; + std::vector* args_; +}; + +const tvm::relay::CallNode* ParseQnnConvComp(const tvm::relay::FunctionNode& comp_fn, + std::unordered_map* ext_attrs, + std::vector* args) { + using namespace tvm::relay; + + // Pattern + auto src = IsWildcard(); + auto wgh = IsWildcard(); + auto sum_src = IsWildcard(); + auto bias = IsConstant(); + + auto o_scl = IsConstant(); + auto act_scl = IsConstant(); + auto sum_scl = IsConstant(); + auto dst_zp = IsConstant(); + + DFPattern cnv; + DFPattern pat; + + cnv = IsOp("qnn.conv2d")({src, wgh, IsConstant(), IsConstant(), IsConstant(), IsConstant()}); + pat = IsOp("cast")({cnv}); + pat = IsOp("add")({pat, bias}) || pat; + pat = IsOp("multiply")({pat, o_scl}); + pat = IsOp("clip")({pat}); + pat = IsOp("multiply")({pat, act_scl}) || pat; + pat = IsOp("add")({pat, sum_scl * IsOp("cast")({sum_src})}) || pat; + pat = IsOp("add")({pat, dst_zp}) || pat; + pat = IsOp("cast")({pat}); + + // Check pattern match + auto indexed_body = CreateIndexedGraph(comp_fn.body); + DFPatternMatcher matcher(indexed_body.get()); + auto res = matcher.Match(pat, comp_fn.body); + ICHECK(res) << "Mismatch of DNNL partitioner and codegen logic"; + + // Handle arguments in deterministic order + auto map = matcher.GetMemo(); + auto find = [&map](const DFPattern& pat) -> tvm::relay::Expr { + if (map.count(pat)) return map.at(pat)[0]; + return {}; + }; + + ArgPacker arg_holder(ext_attrs, args); + arg_holder.Put(find(src)); + arg_holder.Put(find(wgh)); + arg_holder.Put(find(bias), "bias_idx"); + arg_holder.Put(find(sum_src), "sum_idx"); + arg_holder.Put(find(o_scl), "o_scl_idx"); + arg_holder.Put(find(act_scl), "act_scl_idx"); + arg_holder.Put(find(sum_scl), "sum_scl_idx"); + arg_holder.Put(find(dst_zp), "dst_zp_idx"); + + // Activation. Default clip to simulate relu via uint8 cast + std::vector clip_attr{"clip"}; + auto act_scl_val = map.count(act_scl) ? 
find(act_scl) : constant(1.0); + clip_attr.push_back(std::to_string(arg_holder.Put(act_scl_val))); // act_scale + clip_attr.push_back(std::to_string(arg_holder.Put(constant(0.0)))); // alpha + clip_attr.push_back(std::to_string(arg_holder.Put(constant(255.0)))); // beta + (*ext_attrs)["activation"] = dmlc_attr(clip_attr); + + return map.at(cnv)[0].as(); +} + +const tvm::relay::CallNode* ParseQnnDenseComp(const tvm::relay::FunctionNode& comp_fn, + std::unordered_map* ext_attrs, + std::vector* args) { + using namespace tvm::relay; + + // Pattern + auto src = IsWildcard(); + auto wgh = IsWildcard(); + auto sum_src = IsWildcard(); + auto bias = IsConstant(); + + auto o_scl = IsConstant(); + auto act_scl = IsConstant(); + auto sum_scl = IsConstant(); + auto dst_zp = IsConstant(); + + DFPattern dns, act, pat; + + dns = IsOp("qnn.dense")({src, wgh, IsConstant(), IsConstant(), IsConstant(), IsConstant()}); + pat = IsOp("cast")({dns}); + pat = IsOp("add")({pat, bias}) || pat; + pat = IsOp("multiply")({pat, o_scl}); + pat = IsOp("clip")({pat}); + pat = IsOp("multiply")({pat, act_scl}) || pat; + pat = IsOp("add")({pat, sum_scl * IsOp("cast")({sum_src})}) || pat; + pat = IsOp("add")({pat, dst_zp}) || pat; + pat = IsOp("cast")({pat}); + + // Check pattern match + auto indexed_body = CreateIndexedGraph(comp_fn.body); + DFPatternMatcher matcher(indexed_body.get()); + auto res = matcher.Match(pat, comp_fn.body); + ICHECK(res) << "Mismatch of DNNL partitioner and codegen logic"; + + // Handle arguments in deterministic order + auto memo = matcher.GetMemo(); + auto find = [&memo](const DFPattern& pat) -> tvm::relay::Expr { + if (memo.count(pat)) return memo.at(pat)[0]; + return {}; + }; + + ArgPacker arg_holder(ext_attrs, args); + arg_holder.Put(find(src)); + arg_holder.Put(find(wgh)); + arg_holder.Put(find(bias), "bias_idx"); + arg_holder.Put(find(sum_src), "sum_idx"); + arg_holder.Put(find(o_scl), "o_scl_idx"); + arg_holder.Put(find(act_scl), "act_scl_idx"); + arg_holder.Put(find(sum_scl), "sum_scl_idx"); + arg_holder.Put(find(dst_zp), "dst_zp_idx"); + + // Activation. Default clip to simulate relu via uint8 cast + std::vector clip_attr{"clip"}; + auto act_scl_val = memo.count(act_scl) ? find(act_scl) : constant(1.0); + clip_attr.push_back(std::to_string(arg_holder.Put(act_scl_val))); // act_scale + clip_attr.push_back(std::to_string(arg_holder.Put(constant(0.0)))); // alpha + clip_attr.push_back(std::to_string(arg_holder.Put(constant(255.0)))); // beta + (*ext_attrs)["activation"] = dmlc_attr(clip_attr); + + return memo.at(dns)[0].as(); +} + +/*! 
+ * Parse a composite function and return the real args, additional attributes, and root call node + * \param comp_fn composite function to parse + * \param ext_attrs attr collection with additional attributes + * \param args real arguments of the node + * \return root call node + */ +const tvm::relay::CallNode* ParseComposite(const tvm::relay::FunctionNode& comp_fn, + std::unordered_map* ext_attrs, + std::vector* args) { + auto comp = comp_fn.GetAttr(tvm::relay::attr::kComposite); + ICHECK(comp.defined()) << "DNNL JSON runtime only supports composite functions."; + auto name = comp.value(); + + const tvm::relay::CallNode* res = nullptr; + if (name == "dnnl.qnn.conv2d") + res = ParseQnnConvComp(comp_fn, ext_attrs, args); + else if (name == "dnnl.qnn.dense") + res = ParseQnnDenseComp(comp_fn, ext_attrs, args); + return res; +} + +#endif // TVM_RELAY_BACKEND_CONTRIB_DNNL_COMP_OP_MATCHER_H_ diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc index 5b23843c95e62..8715900c0c4a1 100644 --- a/src/relay/backend/te_compiler_cache.cc +++ b/src/relay/backend/te_compiler_cache.cc @@ -193,7 +193,9 @@ class LowerToTECompute : public backend::MemoizedExprTranslatorchecked_type().as(); std::stringstream ss; - ss << "constant_" << const_index++; + std::string s = readable_name_stream_.str(); + std::replace(s.begin(), s.end(), '.', '_'); + ss << s << "_constant_" << const_index++; tvm::te::Tensor tensor = tvm::te::placeholder(GetShape(ttype->shape), ttype->dtype, ss.str()); constant_tensors_[op] = tensor; return {tensor}; diff --git a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc index 5045f3323af7c..a4239186b4b33 100644 --- a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc +++ b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc @@ -134,9 +134,56 @@ class DNNLJSONRuntime : public JSONRuntimeBase { {"tanh", dnnl::algorithm::eltwise_tanh}, {"sigmoid", dnnl::algorithm::eltwise_logistic}, {"clip", dnnl::algorithm::eltwise_clip}, + {"gelu_erf", dnnl::algorithm::eltwise_gelu_erf}, }; - bool ParsingOpName(const std::string op_name, dnnl::primitive_attr attr) { + dnnl::primitive_attr ParseAttrs(const size_t& nid, TensorRequisite* bias_tr) { + dnnl::primitive_attr attr; + + // Post op attributes based on named inputs. + auto dst_zp_tr = GetInputByName(nid, "dst_zp_idx"); + auto o_scl_tr = GetInputByName(nid, "o_scl_idx"); + auto sum_scl_tr = GetInputByName(nid, "sum_scl_idx"); + + if (o_scl_tr) { + ICHECK(o_scl_tr.IsConstant()); + auto data = o_scl_tr.GetConstDataLikeVec(); + attr.set_output_scales(data.size() == 1 ? 0 : (1 << 1), data); + } + + auto activation = GetNodeAttr>(nodes_[nid], "activation", {"none"}); + if (activation[0] != "none") { + auto a_type = elt_name2algo.at(activation[0]); + auto a_scale = GetInput(nid, std::stoi(activation[1])).GetConstScalarData(); + auto a_alfa = GetInput(nid, std::stoi(activation[2])).GetConstScalarData(); + auto a_beta = GetInput(nid, std::stoi(activation[3])).GetConstScalarData(); + + auto ops = attr.get_post_ops(); + ops.append_eltwise(a_scale, a_type, a_alfa, a_beta); + attr.set_post_ops(ops); + } + + if (sum_scl_tr) { + auto scl = sum_scl_tr.GetConstScalarData(); + auto ops = attr.get_post_ops(); + ops.append_sum(scl); + attr.set_post_ops(ops); + } + + if (dst_zp_tr) { + auto zp = dst_zp_tr.GetConstScalarData(); + // Use a linear post op instead of set_zero_points(): set_zero_points() is limited + // to int32, while we need a float zero point here.
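+ // (In oneDNN, eltwise_linear computes alpha * x + beta, so appending it with
+ // alpha = 1.0 and beta = zp adds the zero point in float arithmetic as the
+ // last post op.)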
+ auto ops = attr.get_post_ops(); + ops.append_eltwise(1.0, dnnl::algorithm::eltwise_linear, 1.0, zp); + attr.set_post_ops(ops); + } + *bias_tr = GetInputByName(nid, "bias_idx"); + + if (o_scl_tr || activation[0] != "none" || sum_scl_tr || dst_zp_tr) return attr; + + // parsing of name to extract attributes + auto op_name = nodes_[nid].GetOpName(); // Define RegExp. std::regex bias_add_pat(".*_bias.*"); std::regex relu_pat(".*_relu.*"); @@ -163,7 +210,9 @@ class DNNLJSONRuntime : public JSONRuntimeBase { } // Parsing bias_add. - return std::regex_match(op_name, bias_add_pat) ? true : false; + *bias_tr = std::regex_match(op_name, bias_add_pat) ? GetInput(nid, 2) : TensorRequisite{}; + + return attr; } // Build up the engine based on the input graph. @@ -219,16 +268,16 @@ class DNNLJSONRuntime : public JSONRuntimeBase { void Convolution(const size_t& nid) { auto node = nodes_[nid]; - auto op_name = node.GetOpName(); - dnnl::primitive_attr attr; - attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); - bool has_bias = ParsingOpName(op_name, attr); // Setup attributes. auto src_tr = GetInput(nid, 0); auto wgh_tr = GetInput(nid, 1); auto dst_tr = GetOutput(nid, 0); - auto bias_tr = has_bias ? GetInput(nid, 2) : GetInput(nid, -1); + auto bias_tr = TensorRequisite{}; + + auto attr = ParseAttrs(nid, &bias_tr); + attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + auto strides = GetNodeAttr>(node, "strides"); auto dilates = GetNodeAttr>(node, "dilation"); auto padding = GetNodeAttr>(node, "padding"); @@ -292,25 +341,29 @@ class DNNLJSONRuntime : public JSONRuntimeBase { auto scratchpad_tr = TensorRequisite::AsIs(conv_prim_desc.scratchpad_desc()); - Submit(dnnl::convolution_forward(conv_prim_desc), {{DNNL_ARG_SRC, src_tr}, - {DNNL_ARG_WEIGHTS, wgh_tr}, - {DNNL_ARG_BIAS, bias_tr}, - {DNNL_ARG_SCRATCHPAD, scratchpad_tr}, - {DNNL_ARG_DST, dst_tr}}); + // TODO(@apeskov): Simulation of inplace primitive. just as PoC. + auto sum_in_tr = GetInputByName(nid, "sum_idx").TreatAs(dst_layout); + + Submit(dnnl::convolution_forward(conv_prim_desc), + {{DNNL_ARG_SRC, src_tr}, + {DNNL_ARG_WEIGHTS, wgh_tr}, + {DNNL_ARG_BIAS, bias_tr}, + {DNNL_ARG_SCRATCHPAD, scratchpad_tr}, + {DNNL_ARG_DST, dst_tr}}, + {sum_in_tr, DNNL_ARG_DST}); } void Deconvolution(const size_t& nid) { auto node = nodes_[nid]; - auto op_name = node.GetOpName(); - dnnl::primitive_attr attr; - attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); - bool has_bias = ParsingOpName(op_name, attr); // Setup attributes. auto src_tr = GetInput(nid, 0); auto wgh_tr = GetInput(nid, 1); auto dst_tr = GetOutput(nid, 0); - auto bias_tr = has_bias ? GetInput(nid, 2) : GetInput(nid, -1); + auto bias_tr = TensorRequisite{}; + + auto attr = ParseAttrs(nid, &bias_tr); + attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); auto strides = GetNodeAttr>(node, "strides"); auto dilates = GetNodeAttr>(node, "dilation"); @@ -374,16 +427,15 @@ class DNNLJSONRuntime : public JSONRuntimeBase { void Dense(const size_t& nid) { auto node = nodes_[nid]; - auto op_name = node.GetOpName(); - dnnl::primitive_attr attr; - attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); - bool has_bias = ParsingOpName(op_name, attr); // Setup attributes. auto src_tr = GetInput(nid, 0); auto wgh_tr = GetInput(nid, 1); auto dst_tr = GetOutput(nid, 0); - auto bias_tr = has_bias ? 
GetInput(nid, 2) : GetInput(nid, -1); + auto bias_tr = TensorRequisite{}; + + auto attr = ParseAttrs(nid, &bias_tr); + attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); // Assumption that bias is correct and can be squeezed to 1D bias_tr = bias_tr.Reshape({dst_tr.dims()[1]}); @@ -403,11 +455,16 @@ class DNNLJSONRuntime : public JSONRuntimeBase { auto scratchpad_tr = TensorRequisite::AsIs(dense_prim_desc.scratchpad_desc()); - Submit(dnnl::inner_product_forward(dense_prim_desc), {{DNNL_ARG_SRC, src_tr}, - {DNNL_ARG_WEIGHTS, wgh_tr}, - {DNNL_ARG_BIAS, bias_tr}, - {DNNL_ARG_SCRATCHPAD, scratchpad_tr}, - {DNNL_ARG_DST, dst_tr}}); + // TODO(@apeskov): Simulation of inplace primitive. just as PoC. + auto sum_in_tr = GetInputByName(nid, "sum_idx"); + + Submit(dnnl::inner_product_forward(dense_prim_desc), + {{DNNL_ARG_SRC, src_tr}, + {DNNL_ARG_WEIGHTS, wgh_tr}, + {DNNL_ARG_BIAS, bias_tr}, + {DNNL_ARG_SCRATCHPAD, scratchpad_tr}, + {DNNL_ARG_DST, dst_tr}}, + {sum_in_tr, DNNL_ARG_DST}); } void BatchNorm(const size_t& nid) { @@ -675,6 +732,11 @@ class DNNLJSONRuntime : public JSONRuntimeBase { return res; } + TensorRequisite GetInputByName(const size_t& nid, const std::string& name) { + auto idx = GetNodeAttr(nodes_[nid], name, {"-1"}); + return GetInput(nid, idx); + } + TensorRequisite GetOutput(const size_t& nid, const int idx) { if (idx == -1) return {}; // -1 reserved value for empty input. @@ -692,8 +754,8 @@ class DNNLJSONRuntime : public JSONRuntimeBase { } /*! \brief Helper function to register primitive into execution queue */ - void Submit(const dnnl::primitive& prim, - const std::unordered_map& tr_args) { + void Submit(const dnnl::primitive& prim, const std::unordered_map& tr_args, + const std::pair& inplace_conf = {}) { // Register all provided TR arguments std::unordered_map prim_arg_id; TensorRegistry::ActionQue post_prim_actions; @@ -706,6 +768,18 @@ class DNNLJSONRuntime : public JSONRuntimeBase { prim_arg_id[key] = arg_id; } + // Simulate inplace primitive + if (auto tr = inplace_conf.first) { + auto arg_id = tensor_registry_.Register(tr, &net_); + auto dst_tr = tr_args.at(inplace_conf.second); + auto dst_arg_id = prim_arg_id.at(inplace_conf.second); + + // Register copy action direct before main primitive + dnnl::reorder::primitive_desc io_copy_pd(engine_, tr.desc(), engine_, dst_tr.desc()); + net_.push_back( + {dnnl::reorder(io_copy_pd), {{DNNL_ARG_SRC, arg_id}, {DNNL_ARG_DST, dst_arg_id}}}); + } + // Register main primitive net_.push_back({prim, prim_arg_id}); diff --git a/src/runtime/contrib/dnnl/dnnl_tensor_requisite.h b/src/runtime/contrib/dnnl/dnnl_tensor_requisite.h index d02ceff5de823..bad4bc10edec3 100644 --- a/src/runtime/contrib/dnnl/dnnl_tensor_requisite.h +++ b/src/runtime/contrib/dnnl/dnnl_tensor_requisite.h @@ -275,6 +275,7 @@ class TensorRequisite { * innermost. 
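+ * An undefined requisite is returned unchanged (see the early return below), so
+ * optional inputs resolved to index -1 can pass through layout handling safely.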
*/ TensorRequisite TreatAs(const std::string& layout, std::string desired_logic_layout = "") const { + if (!defined()) return *this; if (desired_logic_layout.empty()) desired_logic_layout = DefaultLogicLayoutFor(layout); const auto origin_dims = dims(); diff --git a/src/runtime/pipeline/pipeline_executor.cc b/src/runtime/pipeline/pipeline_executor.cc index a191f816f7159..b5c560e255e3b 100644 --- a/src/runtime/pipeline/pipeline_executor.cc +++ b/src/runtime/pipeline/pipeline_executor.cc @@ -34,6 +34,9 @@ PackedFunc PipelineExecutor::GetFunction(const std::string& name, if (name == "get_num_outputs") { return PackedFunc( [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->NumOutputs(); }); + } else if (name == "get_num_inputs") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->NumInputs(); }); } else if (name == "get_input_pipeline_map") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { if (String::CanConvertFrom(args[0])) { @@ -87,7 +90,10 @@ PackedFunc PipelineExecutor::GetFunction(const std::string& name, return PackedFunc(); } } - +/*! + * \brief Returns the number of global inputs. + */ +int PipelineExecutor::NumInputs(void) { return input_connection_config_.GetInputNum(); } /*! * \brief set input to the runtime module. * \param input_name The input name. diff --git a/src/runtime/pipeline/pipeline_executor.h b/src/runtime/pipeline/pipeline_executor.h index 9f9b24bdf0bec..87b50ed3a1a90 100644 --- a/src/runtime/pipeline/pipeline_executor.h +++ b/src/runtime/pipeline/pipeline_executor.h @@ -115,6 +115,7 @@ class TVM_DLL PipelineExecutor : public ModuleNode { int NumOutputs() const { return num_outputs_; } /*!\brief Run the pipeline executor.*/ void Run(); + int NumInputs(); /*! * \brief Get a list output data. * \return A list of output data. diff --git a/src/runtime/pipeline/pipeline_struct.h b/src/runtime/pipeline/pipeline_struct.h index 2cb7b4a6d24e5..540103d0186cb 100644 --- a/src/runtime/pipeline/pipeline_struct.h +++ b/src/runtime/pipeline/pipeline_struct.h @@ -560,6 +560,9 @@ struct InputConnectionConfig { } return input_connection[key]; } + /*!\brief Returns the number of global inputs through the input_runtime_map list size.*/ + int GetInputNum() { return input_runtime_map.size(); } + /*! * \brief Getting the global input index through the input name. * \param input_name The global input name.
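A quick standalone sanity check (an editorial illustration only, not part of the patch) of the scale/bias folding that LegalizeQnnOpForDnnl in python/tvm/relay/op/contrib/dnnl.py relies on, for the path without the optional sum. All constants below are made-up scalars; in the pass they are Relay constants:

    import numpy as np

    rng = np.random.default_rng(0)
    conv_out = rng.integers(-100, 100, size=8).astype("float64")  # OP(SRC, WGH)
    zp_term = rng.integers(-10, 10, size=8).astype("float64")     # OP(src_zp, WGH)
    orig_bias, rq_in_zp, rq_out_zp = 7.0, 3.0, 10.0
    rq_in_scl, rq_out_scl = 0.02, 0.2

    # Original form: zero-point correction, bias, then qnn.requantize.
    ref = (conv_out - zp_term + orig_bias - rq_in_zp) * rq_in_scl / rq_out_scl + rq_out_zp

    # Legalized form: one folded bias plus a single output scale o_scl.
    o_scl = rq_in_scl / rq_out_scl
    bias = orig_bias - zp_term - rq_in_zp + rq_out_zp * rq_out_scl / rq_in_scl
    np.testing.assert_allclose((conv_out + bias) * o_scl, ref)  # the two forms agree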
diff --git a/tests/micro/zephyr/test_zephyr.py b/tests/micro/zephyr/test_zephyr.py index 2651435434b11..05c8daa20c215 100644 --- a/tests/micro/zephyr/test_zephyr.py +++ b/tests/micro/zephyr/test_zephyr.py @@ -17,7 +17,6 @@ import logging import os import pathlib -import sys import logging import pytest diff --git a/tests/python/contrib/test_cmsisnn/test_binary_ops.py b/tests/python/contrib/test_cmsisnn/test_binary_ops.py index fec18c197e045..26604da0a64aa 100644 --- a/tests/python/contrib/test_cmsisnn/test_binary_ops.py +++ b/tests/python/contrib/test_cmsisnn/test_binary_ops.py @@ -101,7 +101,7 @@ def make_model( def test_op_int8( op, relu_type, input_0_scale, input_0_zero_point, input_1_scale, input_1_zero_point ): - """Tests QNN Conv2D operator for CMSIS-NN""" + """Tests QNN binary operator for CMSIS-NN""" interface_api = "c" use_unpacked_api = True test_runner = AOT_USMP_CORSTONE300_RUNNER @@ -145,6 +145,65 @@ def test_op_int8( ) +@skip_if_no_reference_system +@tvm.testing.requires_cmsisnn +@pytest.mark.parametrize("op", [relay.qnn.op.mul, relay.qnn.op.add]) +@pytest.mark.parametrize("relu_type", ["RELU", "NONE"]) +def test_same_input_to_binary_op(op, relu_type): + """Tests QNN binary operator for CMSIS-NN where both inputs are the same""" + interface_api = "c" + use_unpacked_api = True + test_runner = AOT_USMP_CORSTONE300_RUNNER + + dtype = "int8" + shape = [1, 16, 16, 3] + input_ = generate_variable("input") + input_scale = 0.256 + input_zero_point = 33 + + model = make_model( + op, + input_, + input_, + input_scale, + input_zero_point, + input_scale, + input_zero_point, + relu_type, + ) + orig_mod = make_module(model) + + cmsisnn_mod = cmsisnn.partition_for_cmsisnn(orig_mod) + + # validate pattern matching + assert_partitioned_function(orig_mod, cmsisnn_mod) + + # Check if the number of internal function parameter is 1 + cmsisnn_global_func = cmsisnn_mod["tvmgen_default_cmsis_nn_main_0"] + assert ( + isinstance(cmsisnn_global_func.body, tvm.relay.expr.Call) + and len(cmsisnn_global_func.body.args) == 1 + ), "Composite function for the binary op should have only 1 parameter." + + # validate the output + in_min, in_max = get_range_for_dtype_str(dtype) + inputs = { + "input": np.random.randint(in_min, high=in_max, size=shape, dtype=dtype), + } + output_list = generate_ref_data(orig_mod["main"], inputs) + compile_and_run( + AOTTestModel( + module=cmsisnn_mod, + inputs=inputs, + outputs=output_list, + output_tolerance=1, + ), + test_runner, + interface_api, + use_unpacked_api, + ) + + def parameterize_for_constant_inputs(test): """Generates parameters in such a way so that at least one of the inputs is a constant, both can't be variables, both can't be scalars. diff --git a/tests/python/contrib/test_cmsisnn/test_extract_constants.py b/tests/python/contrib/test_cmsisnn/test_extract_constants.py index 8831596d40e63..7d3e81a9c79d2 100644 --- a/tests/python/contrib/test_cmsisnn/test_extract_constants.py +++ b/tests/python/contrib/test_cmsisnn/test_extract_constants.py @@ -116,6 +116,40 @@ def test_nested_function(): relay.transform.InferType()(mod) +@tvm.testing.requires_cmsisnn +def test_internal_function_with_duplicate_arguments(): + """Tests the pass ExternConstants when a composite function + is present within global function with repeating arguments + to one of the binary ops. 
+ """ + input0 = relay.var("input0", shape=(8, 8)) + binary_op0 = input0 + input0 + binary_op1 = binary_op0 * relay.const(5.0, "float32") + local_func = relay.Function([input0], binary_op1, relay.TensorType((8, 8), "float32")) + local_func = set_composite_func_attr(local_func, "cmsis-nn") + + arg = relay.var("arg", shape=(8, 8)) + call_local_func = relay.Call(local_func, [arg]) + extern_func = relay.Function([arg], call_local_func, relay.TensorType((8, 8), "float32")) + + global_arg = relay.var("global_var", shape=(8, 8)) + global_var = relay.GlobalVar("external_function") + extern_func = set_external_func_attr(extern_func, "cmsis-nn", global_var.name_hint) + call_extern_func = relay.Call(global_var, [global_arg]) + main_func = relay.Function([global_arg], call_extern_func, relay.TensorType((8, 8), "float32")) + main_var = relay.GlobalVar("main") + + mod = tvm.IRModule() + mod[global_var] = extern_func + mod[main_var] = main_func + + mod = ExtractConstantsFromPartitionedFunction()(mod) + constant_verifier = CheckFunctionsForConstants() + constant_verifier.visit_function(mod[global_var]) + constant_verifier.check_num_constants() + relay.transform.InferType()(mod) + + @tvm.testing.requires_cmsisnn def test_multiple_functions(): """Tests the pass ExternConstants when global function diff --git a/tests/python/contrib/test_cmsisnn/test_scalar_to_tensor_constant.py b/tests/python/contrib/test_cmsisnn/test_scalar_to_tensor_constant.py index 557a65aeffcaf..df54f7ce55f15 100644 --- a/tests/python/contrib/test_cmsisnn/test_scalar_to_tensor_constant.py +++ b/tests/python/contrib/test_cmsisnn/test_scalar_to_tensor_constant.py @@ -256,6 +256,47 @@ def test_all_primary_operands_tensor_constants(): assert tvm.ir.structural_equal(mod[global_var].body, new_mod[global_var].body) +@tvm.testing.requires_cmsisnn +def test_duplicate_constant_arguments(): + """Tests the pass when repeating operands are arguments to the binary op""" + dtype = "int8" + shape = (1, 3, 3, 32) + operand0 = generate_variable("operand0", shape, dtype) + operand1 = generate_variable("operand1", shape, dtype) + binary_op = make_binary_op( + relay.qnn.op.add, + operand0, + operand0, + input_0_scale=0.0128, + input_0_zero_point=32, + input_1_scale=0.256, + input_1_zero_point=-64, + ) + + local_func = relay.Function([operand0, operand1], binary_op, relay.TensorType(shape, dtype)) + local_func = set_composite_func_attr(local_func, "cmsis-nn.qnn_add") + + rng = np.random.default_rng(12345) + arg0 = relay.const(rng.integers(-128, high=127, size=shape, dtype=dtype)) + call_local_func = relay.Call(local_func, [arg0, arg0]) + extern_func = relay.Function([], call_local_func, relay.TensorType(shape, dtype)) + + global_var = relay.GlobalVar("external_function") + extern_func = set_external_func_attr(extern_func, "cmsis-nn", global_var.name_hint) + call_extern_func = relay.Call(global_var, []) + main_func = relay.Function([], call_extern_func, relay.TensorType(shape, dtype)) + main_var = relay.GlobalVar("main") + + mod = tvm.IRModule() + mod[global_var] = extern_func + mod[main_var] = main_func + + mod = relay.transform.InferType()(mod) + mod = ScalarToTensorConstants()(mod) + new_mod = relay.transform.InferType()(mod) + assert tvm.ir.structural_equal(mod[global_var].body, new_mod[global_var].body) + + @tvm.testing.requires_cmsisnn def test_non_cmsisnn_ext_func(): """Non CMSISNN functions should not be altered.""" diff --git a/tests/python/contrib/test_dnnl.py b/tests/python/contrib/test_dnnl.py index c884665421cbf..2138eda086978 100755 --- 
a/tests/python/contrib/test_dnnl.py +++ b/tests/python/contrib/test_dnnl.py @@ -20,6 +20,7 @@ import sys import subprocess import math +import collections import tvm from tvm import relay @@ -51,7 +52,7 @@ def bf16_supported(): cpu_info = subprocess.check_output("sysctl -a", shell=True).strip().decode() for line in cpu_info.split("\n"): if line.startswith("hw.optional.avx512f"): - _bf16_supported = bool(line.split(":", 1)[1]) + _bf16_supported = bool(int(line.split(":", 1)[1])) elif sys.platform.startswith("linux"): _bf16_supported = "avx512" in open("/proc/cpuinfo", "r").read() return _bf16_supported @@ -114,6 +115,7 @@ def partition_for_dnnl(mod, params=None, alter_layout=True, prune_subgraphs=True mod = dnnl.rewrite_layer_norm(mod) mod = dnnl.rewrite_dense_bias_gelu_reshape_last(mod) + mod = dnnl.legalize_qnn_for_dnnl(mod) byoc_seq = tvm.transform.Sequential( [ @@ -1126,5 +1128,540 @@ def get_graph(act=None): ) +def permute_shape(shape, l_from="", l_to=""): + res_shape = [] + for label in l_to: + pos = l_from.find(label) + res_shape.append(shape[pos]) + + return res_shape + + +def expand_dim(shape, rank=0): + assert len(shape) == 1 + return shape + [1] * (rank - 1) + + +def filler_uni(low=0, high=1): + def filler_func(shape): + return np.random.uniform(low, high, shape) + + return filler_func + + +class QnnBuilder: + def __init__(self, qnn_profile=None): + self._args = {} + self._args_op = [] + self._qp = qnn_profile + + def arg(self, shape=[], dtype="float32", filler=filler_uni(), is_const=True): + if isinstance(filler, (int, float)): + value = np.full(shape, filler).astype(dtype) + else: + value = filler(shape).astype(dtype) + + if is_const: + res = relay.const(value, dtype=dtype) + else: + name = f"in_{len(self._args)}" + res = relay.var(name, shape=shape, dtype=dtype) + self._args[name] = value + self._args_op.append(res) + + return res + + def make_zp(self, mean_val, num_ch=1, dispersion=0.2): + if num_ch == 1: + return self.arg(shape=[], dtype="int32", filler=mean_val) + else: + low = int(mean_val * (1 - dispersion)) + high = int(mean_val * (1 + dispersion)) + return self.arg(shape=[num_ch], dtype="int32", filler=filler_uni(low, high)) + + def make_scl(self, mean_val, num_ch=1, dispersion=0.2): + if num_ch == 1: + return self.arg(shape=[], dtype="float32", filler=mean_val) + else: + low = mean_val * (1 - dispersion) + high = mean_val * (1 + dispersion) + return self.arg(shape=[num_ch], dtype="float32", filler=filler_uni(low, high)) + + def make_zp_and_scl(self, name, num_ch=1, dispersion=0.2): + is_per_channel = getattr(self._qp, f"{name}_pc") + zp_val = getattr(self._qp, f"{name}_zp") + scl_val = getattr(self._qp, f"{name}_scl") + + zp = self.make_zp(zp_val, num_ch if is_per_channel else 1, dispersion) + scl = self.make_scl(scl_val, num_ch if is_per_channel else 1, dispersion) + return zp, scl + + def finalize(self, op): + func = relay.Function(self._args_op, op) + mod = tvm.IRModule.from_expr(func) + mod = relay.transform.InferType()(mod) + return mod, self._args + + +def check_fully_annotated(mod, desired_compiler): + matched_ops = [] + other_ops = [] + + def _visit(node): + if isinstance(node, tvm.relay.Call): + op = node.op + if isinstance(op, relay.GlobalVar): + func = mod[op] + if "Compiler" in func.attrs and func.attrs["Compiler"] == desired_compiler: + matched_ops.append(op) + return + else: + other_ops.append(op) + + tvm.relay.analysis.post_order_visit(mod["main"].body, _visit) + + assert len(other_ops) == 0 and len(matched_ops) != 0, "Model is not fully DNNL compiled" + 
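+# check_result (below) verifies the partitioned module against the reference
+# module on two executor paths, the VM and the graph executor, so a regression
+# in either path fails the test.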
+ +def check_result( + mod, + ref_mod, + map_inputs, + tol=1e-5, + target="llvm", + device=tvm.cpu(), + params=None, + ref_result=None, + atol=None, + desired_compiler="dnnl", +): + if atol is None: + atol = tol + + if desired_compiler is not None: + check_fully_annotated(mod, desired_compiler) + + if ref_result is None: + # Run the reference result + relay.backend.te_compiler.get().clear() + with tvm.transform.PassContext(opt_level=3): + ref_lib = relay.build(ref_mod, target=target, params=params) + ref_rt_mod = tvm.contrib.graph_executor.GraphModule(ref_lib["default"](device)) + + for name, data in map_inputs.items(): + ref_rt_mod.set_input(name, data) + ref_rt_mod.run() + out = ref_rt_mod.get_output(0) + ref_result = out.numpy() + + def check_vm_result(): + relay.backend.te_compiler.get().clear() + with tvm.transform.PassContext(opt_level=3): + exe = relay.vm.compile(mod, target=target, params=params) + code, lib = exe.save() + exe = tvm.runtime.vm.Executable.load_exec(code, lib) + vm = tvm.runtime.vm.VirtualMachine(exe, device) + output = vm.run(**map_inputs) + tvm.testing.assert_allclose(output.numpy(), ref_result, rtol=tol, atol=atol) + + def check_graph_executor_result(): + relay.backend.te_compiler.get().clear() + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target=target, params=params) + rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](device)) + + rt_mod.run(**map_inputs) + output = rt_mod.get_output(0) + tvm.testing.assert_allclose(output.numpy(), ref_result, rtol=tol, atol=atol) + + check_vm_result() + check_graph_executor_result() + + +ConvProfile = collections.namedtuple( + "ConvProfile", + [ + "SHAPE", + "KER", + "STR", + "PAD", + "DEL", + "OC", + "GR", + "D_LAYOUT", + "K_LAYOUT", + ], +) +base_conv = ConvProfile( + SHAPE=[1, 8, 5, 5], + KER=[3, 3], + STR=[1, 1], + PAD=[1, 1], + DEL=[1, 1], + OC=16, + GR=1, + D_LAYOUT="NCHW", + K_LAYOUT="OIHW", +) +base_conv_nhwc = base_conv._replace(D_LAYOUT="NHWC", K_LAYOUT="HWIO") +base_conv_dilated = base_conv._replace(PAD=[2, 2], DEL=[2, 2]) +base_conv_no_pad = base_conv._replace(PAD=[0, 0]) +base_conv_no_pad_nhwc = base_conv_no_pad._replace(D_LAYOUT="NHWC", K_LAYOUT="HWIO") +base_conv_group_no_pad = base_conv_no_pad._replace(GR=2) +base_conv_dw_no_pad = base_conv_no_pad._replace(SHAPE=[1, 16, 5, 5], GR=16) + + +DenseProfile = collections.namedtuple("DenseProfile", ["N", "IC", "OC"]) +base_dense_profile = DenseProfile(N=2, IC=10, OC=16) + +ArgConstConfig = collections.namedtuple("ArgConstConfig", ["Data", "Weights", "Bias", "Sum"]) +acp_regular = ArgConstConfig(Data=False, Weights=True, Bias=True, Sum=None) +acp_no_bias = ArgConstConfig(Data=False, Weights=True, Bias=None, Sum=None) +acp_with_sum = ArgConstConfig(Data=False, Weights=True, Bias=True, Sum=False) +acp_no_bias_with_sum = ArgConstConfig(Data=False, Weights=True, Bias=None, Sum=False) + +QuantizationConfig = collections.namedtuple( + "QuantizationConfig", + [ + "d_zp", + "d_scl", + "d_pc", + "k_zp", + "k_scl", + "k_pc", + "rq_zp", + "rq_scl", + "rq_pc", + "sum_zp", + "sum_scl", + "sum_pc", + "o_zp", + "o_scl", + "o_pc", + ], +) + +qp_regular = QuantizationConfig( + d_zp=0, + d_scl=0.2, + d_pc=False, + k_zp=0, + k_scl=0.1, + k_pc=False, + rq_zp=30, + rq_scl=0.2, + rq_pc=False, + sum_zp=15, + sum_scl=0.3, + sum_pc=False, + o_zp=5, + o_scl=0.2, + o_pc=False, +) +qp_asymmetric_data = qp_regular._replace( + d_zp=3, rq_zp=10, rq_scl=0.1, sum_zp=15, sum_scl=0.3, o_zp=4 +) + +qnn_conv_profiles = tvm.testing.parameter( + by_dict={ + # Pattern 
qnn.conv2d + qnn.requantize + "Base": (base_conv, acp_regular, qp_regular), + "NHWC": (base_conv_nhwc, acp_regular, qp_regular), + # Asymmetric input. NOTE: No pad! Input ZP is not compatible with padding + "Group": (base_conv_group_no_pad, acp_regular, qp_asymmetric_data), + "DW": (base_conv_dw_no_pad, acp_regular, qp_asymmetric_data), + "NoBias": (base_conv, acp_no_bias, qp_regular), + "AsymmetricInput": (base_conv_no_pad, acp_regular, qp_asymmetric_data), + "AsymmetricInput_NHWC": (base_conv_no_pad_nhwc, acp_regular, qp_asymmetric_data), + # Pattern Conv2d + Requantize + Sum + "WithSum": (base_conv_no_pad, acp_with_sum, qp_asymmetric_data), + "WithSum_NHWC": (base_conv_no_pad_nhwc, acp_with_sum, qp_asymmetric_data), + "WithSum_NoBias": (base_conv_no_pad, acp_no_bias_with_sum, qp_asymmetric_data), + } +) + + +@has_dnnl_codegen +def test_qnn_conv2d(qnn_conv_profiles): + def generate_model(p, c, q): + np.random.seed(0) + + N, IC, IH, IW = p.SHAPE + d_shape = p.SHAPE + w_shape = [p.OC, IC, *p.KER] + b_shape = [p.OC] + s_shape = [ + p.SHAPE[0], + p.OC, + (IH + 2 * p.PAD[0] - (p.KER[0] - 1) * p.DEL[0] - 1) // p.STR[0] + 1, + (IW + 2 * p.PAD[1] - (p.KER[1] - 1) * p.DEL[1] - 1) // p.STR[1] + 1, + ] + + if p.GR != 1: + w_shape[1] //= p.GR + + d_shape = permute_shape(d_shape, l_from="NCHW", l_to=p.D_LAYOUT) + s_shape = permute_shape(s_shape, l_from="NCHW", l_to=p.D_LAYOUT) + w_shape = permute_shape(w_shape, l_from="OIHW", l_to=p.K_LAYOUT) + + c_dim = p.D_LAYOUT.find("C") + b_shape = expand_dim(b_shape, rank=len(p.D_LAYOUT) - c_dim) + + bld = QnnBuilder(qnn_profile=q) + + # Start build a test graph + data = bld.arg(shape=d_shape, dtype="uint8", is_const=c.Data, filler=filler_uni(0, 20)) + d_zp, d_scl = bld.make_zp_and_scl("d", IC) + + # Convolution + wgh = bld.arg(shape=w_shape, dtype="int8", is_const=c.Weights, filler=filler_uni(-20, 20)) + w_zp, w_scl = bld.make_zp_and_scl("k") + + op = tvm.relay.qnn.op.conv2d( + data, + wgh, + d_zp, + w_zp, + d_scl, + w_scl, + kernel_size=p.KER, + padding=p.PAD, + strides=p.STR, + dilation=p.DEL, + groups=p.GR, + channels=p.OC, + out_dtype="int32", + data_layout=p.D_LAYOUT, + kernel_layout=p.K_LAYOUT, + ) + # Optional bias + if c.Bias is not None: + bias = bld.arg( + shape=b_shape, dtype="int32", is_const=c.Bias, filler=filler_uni(-50, 50) + ) + op = tvm.relay.add(op, bias) + + # Re-quantization + rq_in_zp = bld.make_zp(0) + rq_in_scl = bld.make_scl(q.d_scl * q.k_scl) # in real cases that should be a vector + rq_out_zp, rq_out_scl = bld.make_zp_and_scl("rq") + + op = tvm.relay.qnn.op.requantize( + op, rq_in_scl, rq_in_zp, rq_out_scl, rq_out_zp, out_dtype="int32" + ) + op = tvm.relay.clip( + op, a_min=0.0, a_max=255.0 + ) # pytorch frontend specific, I guess it's redundant + op = tvm.relay.cast(op, dtype="uint8") + + # Optional sum (ResNet like) + if c.Sum is not None: + sum_in = bld.arg(dtype="uint8", shape=s_shape, filler=filler_uni(0, 10), is_const=c.Sum) + + lhs_zp, lhs_scl = bld.make_zp_and_scl("rq") + rhs_zp, rhs_scl = bld.make_zp_and_scl("sum") + out_zp, out_scl = bld.make_zp_and_scl("o") + + op = tvm.relay.qnn.op.add(op, sum_in, lhs_scl, lhs_zp, rhs_scl, rhs_zp, out_scl, out_zp) + op = tvm.relay.clip(op, a_min=0.0, a_max=255.0) + + return bld.finalize(op) + + conv_p, arg_p, quant_p = qnn_conv_profiles + ref_mod, args = generate_model(conv_p, arg_p, quant_p) + mod = partition_for_dnnl(ref_mod) + + # atol=1 means int values should match with +-1 quantum value tolerance + check_result(mod, ref_mod, args, tol=1e-10, atol=1, desired_compiler="dnnl") + + 
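+# Note: the spatial dims of s_shape in generate_model above follow the standard
+# convolution output-size formula out = (in + 2 * pad - (kernel - 1) * dilation - 1) // stride + 1,
+# e.g. for base_conv: (5 + 2 * 1 - (3 - 1) * 1 - 1) // 1 + 1 == 5.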
+conv_profiles = tvm.testing.parameter( + by_dict={ + "Base": (base_conv, acp_regular), + "NHWC": (base_conv_nhwc, acp_regular), + "Group": (base_conv_group_no_pad, acp_regular), + "DW": (base_conv_dw_no_pad, acp_regular), + "Dilated": (base_conv_dilated, acp_regular), + } +) + + +@has_dnnl_codegen +def test_conv2d_plus(conv_profiles): + def generate_model(p, c): + np.random.seed(0) + + N, IC, IH, IW = p.SHAPE + d_shape = p.SHAPE + w_shape = [p.OC, IC, *p.KER] + b_shape = [p.OC] + s_shape = [ + p.SHAPE[0], + p.OC, + (IH + 2 * p.PAD[0] - (p.KER[0] - 1) * p.DEL[0] - 1) // p.STR[0] + 1, + (IW + 2 * p.PAD[1] - (p.KER[1] - 1) * p.DEL[1] - 1) // p.STR[1] + 1, + ] + + if p.GR != 1: + w_shape[1] //= p.GR + + d_shape = permute_shape(d_shape, l_from="NCHW", l_to=p.D_LAYOUT) + s_shape = permute_shape(s_shape, l_from="NCHW", l_to=p.D_LAYOUT) + w_shape = permute_shape(w_shape, l_from="OIHW", l_to=p.K_LAYOUT) + + c_dim = p.D_LAYOUT.find("C") + # b_shape = expand_dim(b_shape, rank=len(p.D_LAYOUT) - c_dim) + + bld = QnnBuilder() + + op = bld.arg(shape=d_shape, dtype="float32", is_const=c.Data) + wgh = bld.arg(shape=w_shape, dtype="float32", is_const=c.Weights) + op = tvm.relay.nn.conv2d( + op, + wgh, + kernel_size=p.KER, + padding=p.PAD, + strides=p.STR, + dilation=p.DEL, + groups=p.GR, + channels=p.OC, + out_dtype="float32", + data_layout=p.D_LAYOUT, + kernel_layout=p.K_LAYOUT, + ) + + if c.Bias is not None: + bias = bld.arg(shape=b_shape, dtype="float32", is_const=c.Bias) + op = tvm.relay.nn.bias_add(op, bias, axis=c_dim) + + if c.Sum is not None: + sum_in = bld.arg(shape=s_shape, dtype="float32", is_const=c.Sum) + op = tvm.relay.op.add(op, sum_in) + + return bld.finalize(op) + + conv_p, arg_p = conv_profiles + ref_mod, args = generate_model(conv_p, arg_p) + mod = partition_for_dnnl(ref_mod, alter_layout=False) + check_result(mod, ref_mod, args, tol=1e-5, desired_compiler="dnnl") + + +qnn_dense_profiles = tvm.testing.parameter( + by_dict={ + # Pattern Dense + Requantize + "Base": (base_dense_profile, acp_regular, qp_regular), + "AsymmetricInput": (base_dense_profile, acp_regular, qp_asymmetric_data), + # Pattern Dense + Requantize + Sum + "AsymmetricInput_Sum": (base_dense_profile, acp_with_sum, qp_asymmetric_data), + } +) + + +@has_dnnl_codegen +def test_qnn_dense(qnn_dense_profiles): + def generate_model(p, c, q): + np.random.seed(0) + + d_shape = [p.N, p.IC] + w_shape = [p.OC, p.IC] + b_shape = [p.OC] + s_shape = [p.N, p.OC] + + bld = QnnBuilder(qnn_profile=q) + + # Start build a test graph + data = bld.arg(shape=d_shape, dtype="uint8", is_const=c.Data, filler=filler_uni(0, 20)) + d_zp, d_scl = bld.make_zp_and_scl("d", p.IC) + + # Convolution + wgh = bld.arg(shape=w_shape, dtype="int8", is_const=c.Weights, filler=filler_uni(-20, 20)) + w_zp, w_scl = bld.make_zp_and_scl("k") + + op = tvm.relay.qnn.op.dense( + data, wgh, d_zp, w_zp, d_scl, w_scl, units=p.OC, out_dtype="int32" + ) + # Optional bias + if c.Bias is not None: + bias = bld.arg( + shape=b_shape, dtype="int32", is_const=c.Bias, filler=filler_uni(-50, 50) + ) + op = tvm.relay.add(op, bias) + + # Re-quantization + rq_in_zp = bld.make_zp(0) + rq_in_scl = bld.make_scl(q.d_scl * q.k_scl) # in real cases that should be a vector + rq_out_zp, rq_out_scl = bld.make_zp_and_scl("rq") + + op = tvm.relay.qnn.op.requantize( + op, rq_in_scl, rq_in_zp, rq_out_scl, rq_out_zp, out_dtype="int32" + ) + op = tvm.relay.clip( + op, a_min=0.0, a_max=255.0 + ) # pytorch frontend specific, I guess it's redundant + op = tvm.relay.cast(op, dtype="uint8") + + # Optional 
sum (ResNet like) + if c.Sum is not None: + sum_in = bld.arg(dtype="uint8", shape=s_shape, filler=filler_uni(0, 10), is_const=c.Sum) + + lhs_zp, lhs_scl = bld.make_zp_and_scl("rq") + rhs_zp, rhs_scl = bld.make_zp_and_scl("sum") + out_zp, out_scl = bld.make_zp_and_scl("o") + + op = tvm.relay.qnn.op.add(op, sum_in, lhs_scl, lhs_zp, rhs_scl, rhs_zp, out_scl, out_zp) + op = tvm.relay.clip(op, a_min=0.0, a_max=255.0) + + return bld.finalize(op) + + conv_p, arg_p, quant_p = qnn_dense_profiles + ref_mod, args = generate_model(conv_p, arg_p, quant_p) + mod = partition_for_dnnl(ref_mod) + + # atol=1 means int values should match with +-1 quantum value tolerance + check_result(mod, ref_mod, args, tol=1e-10, atol=1, desired_compiler="dnnl") + + +dense_profiles = tvm.testing.parameter( + by_dict={ + "Base": (base_dense_profile, acp_regular), + "WithSum": (base_dense_profile, acp_with_sum), + } +) + + +@has_dnnl_codegen +def test_dense_plus(dense_profiles): + def generate_model(p, c): + np.random.seed(0) + + d_shape = [p.N, p.IC] + w_shape = [p.OC, p.IC] + b_shape = [p.OC] + s_shape = [p.N, p.OC] + + c_dim = 1 + + bld = QnnBuilder() + + op = bld.arg(shape=d_shape, dtype="float32", is_const=c.Data) + wgh = bld.arg(shape=w_shape, dtype="float32", is_const=c.Weights) + op = tvm.relay.nn.dense(op, wgh, out_dtype="float32") + + if c.Bias is not None: + bias = bld.arg(shape=b_shape, dtype="float32", is_const=c.Bias) + op = tvm.relay.nn.bias_add(op, bias, axis=c_dim) + + if c.Sum is not None: + sum_in = bld.arg(shape=s_shape, dtype="float32", is_const=c.Sum) + op = tvm.relay.op.add(op, sum_in) + + return bld.finalize(op) + + dense_p, arg_p = dense_profiles + ref_mod, args = generate_model(dense_p, arg_p) + mod = partition_for_dnnl(ref_mod) + check_result(mod, ref_mod, args, tol=1e-5, desired_compiler="dnnl") + + if __name__ == "__main__": tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/benchmark_util.py b/tests/python/contrib/test_hexagon/benchmark_util.py index 113c7780c130f..e581c3d55d216 100644 --- a/tests/python/contrib/test_hexagon/benchmark_util.py +++ b/tests/python/contrib/test_hexagon/benchmark_util.py @@ -16,6 +16,57 @@ # under the License. import csv +import os +import pytest +import tempfile +import collections + + +def skip_bencharks_flag_and_reason(): + """ + Returns one of these tuples: + (False, '') or + (True, (a string describing why the test should be skipped)) + + NOTE: This function is a temporary measure to prevent the TVM CI system + from running benchmark scripts every time the CI pre-commit hook executes. + This should go away when a better system is in place to govern when various + tests / benchmarks are executed. + """ + asn = os.environ.get("ANDROID_SERIAL_NUMBER") + + if asn == "simulator": + return (True, "Skipping benchmarks when ANDROID_SERIAL_NUMBER='simulator'") + else: + return (False, "") + + +class UnsupportedException(Exception): + """ + Indicates that the specified benchmarking configuration is known to + currently be unsupported. The Exception message may provide more detail. + """ + + +class NumericalAccuracyException(Exception): + """ + Indicates that the benchmarking configuration appeared to run successfully, + but the output data didn't have the expected accuracy. + """
+
+
+class UnsupportedException(Exception):
+    """
+    Indicates that the specified benchmarking configuration is known to
+    currently be unsupported. The Exception message may provide more detail.
+    """
+
+
+class NumericalAccuracyException(Exception):
+    """
+    Indicates that the benchmarking configuration appeared to run successfully,
+    but the output data didn't have the expected accuracy.
+    """


 class BenchmarksTable:
@@ -173,3 +224,62 @@ def get_benchmark_decription(keys_dict):
     other characters that make it unsuitable for use as a filename.
     """
     return " ".join([f"{k}={v}" for k, v in keys_dict.items()])
+
+
+# This fixture provides some initialization / finalization logic for groups of related
+# benchmark runs.
+# See the fixture implementation below for details.
+#
+# The fixture's mechanics are described here: https://stackoverflow.com/a/63047695
+#
+# TODO: There may be cleaner ways to let each class that uses this fixture provide its
+# own value for `csv_column_order`.
+#
+# TODO: In the future we may wish to break this fixture up into several smaller ones.
+#
+# The overall contract for a class (e.g. `MyTest`) using this fixture is as follows:
+#
+#    @pytest.mark.usefixtures("benchmark_group")
+#    class MyTest:
+#
+#        # The fixture requires that this class variable is defined before
+#        # the fixture's finalizer-logic executes.
+#        #
+#        # This is used as an argument to BenchmarksTable.print_csv(...) after
+#        # all of MyTest's unit tests have executed.
+#        csv_column_order = [
+#            ...
+#        ]
+#
+# Before MyTest's first unit test executes, the fixture will populate the
+# following class variables:
+#    MyTest.working_dir : str
+#    MyTest.benchmark_table : BenchmarksTable
+@pytest.fixture(scope="class")
+def benchmark_group(request):
+    working_dir = tempfile.mkdtemp()
+    bt = BenchmarksTable()
+
+    request.cls.working_dir = working_dir
+    request.cls.benchmark_table = bt
+
+    yield
+
+    tabular_output_filename = os.path.join(working_dir, "benchmark-results.csv")
+
+    if not hasattr(request.cls, "csv_column_order"):
+        raise Exception('Classes using this fixture must have a member named "csv_column_order"')
+
+    with open(tabular_output_filename, "w") as csv_file:
+        bt.print_csv(csv_file, request.cls.csv_column_order)
+
+    print()
+    print("*" * 80)
+    print(f"BENCHMARK RESULTS FILE: {tabular_output_filename}")
+    print("*" * 80)
+    print()
+
+    if bt.has_fail() > 0:
+        pytest.fail("At least one benchmark configuration failed", pytrace=False)
diff --git a/tests/python/contrib/test_hexagon/benchmark_elemwise_add.py b/tests/python/contrib/test_hexagon/test_benchmark_elemwise_add.py
similarity index 93%
rename from tests/python/contrib/test_hexagon/benchmark_elemwise_add.py
rename to tests/python/contrib/test_hexagon/test_benchmark_elemwise_add.py
index 70266d7939bc5..f7f5f3e176e46 100644
--- a/tests/python/contrib/test_hexagon/benchmark_elemwise_add.py
+++ b/tests/python/contrib/test_hexagon/test_benchmark_elemwise_add.py
@@ -28,7 +28,9 @@
 from tvm.script import tir as T
 from tvm import te
 from tvm.contrib.hexagon.build import HexagonLauncherRPC
-from . import benchmark_util
+from . import benchmark_util as bu
+
+_SHOULD_SKIP_BENCHMARKS, _SKIP_BENCHMARKS_REASON = bu.skip_bencharks_flag_and_reason()

 # This is a fixed detail of the v68 architecture.
 HVX_VECTOR_BYTES = 128
@@ -43,7 +45,7 @@
 # triggering TIME_WAIT state on the server socket. This prevents another
 # server to bind to the same port until the wait time elapses.
-_BT = benchmark_util.BenchmarksTable() +_BT = bu.BenchmarksTable() _CSV_COLUMN_ORDER = [ # Identifies which TE-compute / TIRScript is used as the basis for the @@ -88,21 +90,6 @@ print("-" * 80) print() - -class UnsupportedException(Exception): - """ - Indicates that the specified benchmarking configuration is known to - currently be unsupported. The Exception message may provide more detail. - """ - - -class NumericalAccuracyException(Exception): - """ - Indicates that the benchmarking configuration appeared to run successfully, - but the output data didn't have the expected accuracy. - """ - - from typing import Tuple @@ -129,7 +116,7 @@ def _get_irmod_elemwise_add( dtype_str = str(dtype) if mem_scope == "global.vtcm": - raise UnsupportedException("This benchmark kernel does not yet support VTCM buffers.") + raise bu.UnsupportedException("This benchmark kernel does not yet support VTCM buffers.") # This check is currently elided by the one above, but it should become relevant as soon # as we add VTCM support to this kernel generator. @@ -147,7 +134,7 @@ def _get_irmod_elemwise_add( estimated_vtcm_needed_bytes = shape[0] * shape[1] * dtype_bytes * num_vtcm_tensors if estimated_vtcm_needed_bytes > estimated_vtcm_budget_bytes: - raise UnsupportedException("Expect to exceed VTCM budget.") + raise bu.UnsupportedException("Expect to exceed VTCM budget.") @tvm.script.ir_module class BenchmarkModule: @@ -190,10 +177,10 @@ def _benchmark_hexagon_elementwise_add_kernel( "mem_scope": mem_scope, } - desc = benchmark_util.get_benchmark_decription(keys_dict) + desc = bu.get_benchmark_decription(keys_dict) # Create the host-side directory for this benchmark run's files / logs... - host_files_dir_name = benchmark_util.get_benchmark_id(keys_dict) + host_files_dir_name = bu.get_benchmark_id(keys_dict) host_files_dir_path = os.path.join(_HOST_OUTPUT_DIR, host_files_dir_name) os.mkdir(host_files_dir_path) @@ -238,7 +225,9 @@ def _benchmark_hexagon_elementwise_add_kernel( # Upload the .so to the Android device's file system (or wherever is appropriate # when using the Hexagon simulator)... target_dso_binary_filename = "test_binary.so" - hexagon_launcher.upload(host_dso_binary_path, target_dso_binary_filename) + target_dso_binary_pathname = hexagon_launcher.upload( + host_dso_binary_path, target_dso_binary_filename + ) # Generate our testing / validation data... ( @@ -251,7 +240,7 @@ def _benchmark_hexagon_elementwise_add_kernel( # On the target device / simulator, make our Hexagon-native shared object # available for use... loaded_hexagon_module: tvm.runtime.module.Module = hexagon_launcher.load_module( - target_dso_binary_filename, sess + target_dso_binary_pathname, sess ) # Create the target-side tensors to hold the primfunc's inputs and outputs... @@ -296,11 +285,11 @@ def _benchmark_hexagon_elementwise_add_kernel( result, host_numpy_C_data_expected, rel_tolerance, abs_tolerance ) except AssertionError as e: - raise NumericalAccuracyException(str(e)) + raise bu.NumericalAccuracyException(str(e)) _BT.record_success(timing_result, **keys_dict) - except NumericalAccuracyException as e: + except bu.NumericalAccuracyException as e: print() print(f"FAIL: Numerical accuracy error. See log file.") @@ -309,7 +298,7 @@ def _benchmark_hexagon_elementwise_add_kernel( _BT.record_fail(**keys_dict, comments=f"Numerical accuracy error. 
See log file.")

-    except UnsupportedException as e:
+    except bu.UnsupportedException as e:
         print()
         print(f"SKIP: {e}")
@@ -381,6 +370,7 @@ def _get_elemwise_add_reference_value_tensors(shape: list, dtype: str):
     ]


+@pytest.mark.skipif(_SHOULD_SKIP_BENCHMARKS, reason=_SKIP_BENCHMARKS_REASON)
 @tvm.testing.requires_hexagon
 def test_elemwise_add(hexagon_launcher: HexagonLauncherRPC):
     for dtype in [
@@ -432,3 +422,7 @@ def test_elemwise_add(hexagon_launcher: HexagonLauncherRPC):

     if _BT.has_fail() > 0:
         pytest.fail("At least one benchmark configuration failed", pytrace=False)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py b/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py
new file mode 100644
index 0000000000000..41169494417a2
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py
@@ -0,0 +1,351 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+This module serves two purposes:
+    (1) Demonstrates how to write Python code that exercises various
+        Hexagon-related algorithms / features.
+
+    (2) Benchmarks the resulting primfuncs.
+
+Current limitations:
+    - Input shapes are limited to NHWC --> NHWC_8h8w32c.
+
+    - Testing parameters (input shapes, dtypes, etc.) currently
+      support only one value for each parameter.
+
+    - H, W, C must be integer multiples of 8, 8, and 32,
+      respectively. I.e., partial blocks aren't currently
+      supported by this script.
+
+    - Requires that I/O tensors reside in "global.vtcm" memory,
+      rather than "global" memory.
+      This prevents benchmarking with I/O tensors that are too
+      large to fit into available VTCM.
+
+    - The script currently implements only one primfunc.
+      Future revisions are expected to add more primfuncs and
+      demonstrate more coding strategies.
+"""
+
+import sys
+import pytest
+import numpy as np
+import copy
+import os
+
+import tvm.testing
+from tvm import te, topi, tir
+from tvm.topi import testing
+from tvm.script import tir as T
+from tvm.tir import IndexMap
+from tvm.relay.backend import Executor, Runtime
+from tvm.contrib.hexagon.session import Session
+from typing import List
+
+from .infrastructure import allocate_hexagon_array
+from . import benchmark_util as bu
+from .benchmark_util import benchmark_group
+
+_SHOULD_SKIP_BENCHMARKS, _SKIP_BENCHMARKS_REASON = bu.skip_bencharks_flag_and_reason()
+
+
+def _ceil_div(numerator, denominator):
+    return (numerator + (denominator - 1)) // denominator
+
+
+def _int8_nhwc_8h8w32c_map(n, h, w, c):
+    return [
+        n,
+        h // 8,
+        w // 8,
+        c // 32,
+        te.AXIS_SEPARATOR,
+        h % 8,
+        w % 8,
+        c % 32,
+    ]
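+
+
+# For example, logical index (n, h, w, c) = (0, 10, 3, 40) maps to physical index
+# (n, h//8, w//8, c//32, h%8, w%8, c%32) = (0, 1, 0, 1, 2, 3, 8): element (10, 3, 40)
+# lands in block (1, 0, 1) at intra-block offset (2, 3, 8). te.AXIS_SEPARATOR marks
+# where the block coordinates end and the intra-block coordinates begin.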
+
+
+def _int8_nhwc_8h8w32c_shape(n, h, w, c) -> List[int]:
+    return [
+        n,
+        _ceil_div(h, 8),
+        _ceil_div(w, 8),
+        _ceil_div(c, 32),
+        8,
+        8,
+        32,
+    ]
+
+
+def _int8_nhwc_8h8w32c_xform_immediate(arr_in: np.ndarray) -> np.ndarray:
+    """
+    Return a deep copy of 'arr_in', transformed from an NHWC to an
+    NHWC-8h8w32c shape. Any newly created array elements have value 0.
+    """
+    stage1 = copy.copy(arr_in)
+
+    (
+        n,
+        h,
+        w,
+        c,
+    ) = stage1.shape
+
+    (
+        h_minor,
+        w_minor,
+        c_minor,
+    ) = [8, 8, 32]
+
+    h_major = _ceil_div(h, h_minor)
+    w_major = _ceil_div(w, w_minor)
+    c_major = _ceil_div(c, c_minor)
+
+    # This handles cases where the dimensions of arr_in are not cleanly divided
+    # by the minor block size, i.e. [8, 8, 32].
+    #
+    # Any additional array elements that this creates will have value 0.
+    # We shouldn't actually care what value is used for those elements, because they
+    # shouldn't be treated as meaningful by any of our algorithms.
+    if (h % h_minor) or (w % w_minor) or (c % c_minor):
+        stage1.resize((n, h_major * h_minor, w_major * w_minor, c_major * c_minor), refcheck=False)
+
+    stage2 = stage1.reshape(n, h_major, h_minor, w_major, w_minor, c_major, c_minor)
+    stage3 = stage2.transpose(0, 1, 3, 5, 2, 4, 6)
+    return stage3
+
+
+def _create_test_input(shape, dtype: str) -> np.ndarray:
+    np_dtype = np.dtype(dtype)
+    min_value = np.iinfo(np_dtype).min
+    max_value = np.iinfo(np_dtype).max
+    return np.random.randint(low=min_value, high=max_value, size=tuple(shape), dtype=np_dtype)
+
+
+@pytest.mark.usefixtures("benchmark_group")
+class TestMaxPool2D:
+    csv_column_order = [
+        # Identifies which TE-compute / TIRScript is used as the basis for the
+        # benchmarked primfunc. Only needs to be meaningful to humans.
+        "basic_kernel",
+        # When applicable, indicates the particular variation of schedules
+        # applied by the Python code. Decoding this may require looking at this
+        # script's source code.
+        "sched_type",
+        # Values directly based on test parameters...
+        "input_shape_4d",
+        "block_shape",
+        "DTYPE",
+        "KERNEL",
+        "STRIDE",
+        "DILATION",
+        "PADDING",
+        "IO_TENSOR_MEM_SCOPE",
+        # Reserved columns defined by the BenchmarksTable class.
+        "row_status",
+        "timings_min_usecs",
+        "timings_max_usecs",
+        "timings_median_usecs",
+        "timings_mean_usecs",
+        "timings_stddev_usecs",
+        # For benchmarks that produce files on the host file system, this indicates
+        # their location. Useful for post-mortem investigation of benchmark results.
+        "host_files_dir_path",
+        # Miscellaneous comments about the benchmark.
+        "comments",
+    ]
+
+    DTYPE = tvm.testing.parameter("int8")
+
+    # FIXME(cconvey): The script currently fails when H, W, or C is not an
+    # integer multiple of 8, 8, or 32, respectively.
+    N = tvm.testing.parameter(1)
+    H = tvm.testing.parameter(*[x * 8 for x in [1, 4, 16]])
+    W = tvm.testing.parameter(*[x * 8 for x in [1, 4, 16]])
+    C = tvm.testing.parameter(*[x * 32 for x in [1, 2]])
+
+    KERNEL = tvm.testing.parameter((1, 1), (3, 3))
+    STRIDE = tvm.testing.parameter((1, 1))
+    DILATION = tvm.testing.parameter((1, 1))
+    PADDING = tvm.testing.parameter((0, 0, 0, 0))
+    IO_TENSOR_MEM_SCOPE = tvm.testing.parameter("global.vtcm")
+
+    @pytest.mark.skipif(_SHOULD_SKIP_BENCHMARKS, reason=_SKIP_BENCHMARKS_REASON)
+    @tvm.testing.requires_hexagon
+    def test_maxpool2d_nhwc(
+        self,
+        N,
+        H,
+        W,
+        C,
+        DTYPE,
+        KERNEL,
+        STRIDE,
+        DILATION,
+        PADDING,
+        IO_TENSOR_MEM_SCOPE,
+        hexagon_session: Session,
+    ):
+        keys_dict = {
+            "basic_kernel": "max_pool2d",
+            "sched_type": 1,
+            "input_shape_4d": [N, H, W, C],
+            "block_shape": [8, 8, 32],
+            "DTYPE": DTYPE,
+            "KERNEL": KERNEL,
+            "STRIDE": STRIDE,
+            "DILATION": DILATION,
+            "PADDING": PADDING,
+            "IO_TENSOR_MEM_SCOPE": IO_TENSOR_MEM_SCOPE,
+        }
+
+        desc = bu.get_benchmark_decription(keys_dict)
+
+        # Create the host-side directory for this benchmark run's files / logs...
+        host_files_dir_name = bu.get_benchmark_id(keys_dict)
+        host_files_dir_path = os.path.join(self.working_dir, host_files_dir_name)
+        os.mkdir(host_files_dir_path)
+
+        keys_dict["host_files_dir_path"] = host_files_dir_path
+
+        log_file_path = os.path.join(host_files_dir_path, "out.txt")
+        with open(log_file_path, "w") as log_file:
+            print(f"CONFIGURATION: {desc}")
+            log_file.write(f"CONFIGURATION: {desc}\n")
+
+            try:
+                input_tensor_shape_4d = [N, H, W, C]
+                input_tensor_shape_7d = _int8_nhwc_8h8w32c_shape(N, H, W, C)
+
+                data = te.placeholder(tuple(input_tensor_shape_4d), dtype=DTYPE)
+
+                output = topi.nn.pool2d(
+                    data, KERNEL, STRIDE, DILATION, PADDING, "max", layout="NHWC"
+                )
+                primfunc = te.create_prim_func([data, output])
+
+                sch = tir.Schedule(primfunc, debug_mask="all")
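+
+                # `block="tensor"` and `buffer="placeholder"` refer to the names that
+                # te.create_prim_func assigned to the pool2d compute block and its input
+                # buffer; transform_layout rewrites that buffer into the blocked
+                # NHWC-8h8w32c layout described by _int8_nhwc_8h8w32c_map.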
+                sch.transform_layout(
+                    block="tensor", buffer="placeholder", index_map=_int8_nhwc_8h8w32c_map
+                )
+
+                target_hexagon = tvm.target.hexagon("v69", link_params=True)
+                built_module = tvm.build(
+                    sch.mod, target=tvm.target.Target(target_hexagon, host=target_hexagon)
+                )
+
+                # Save a local copy of the Hexagon object code (in the form of a .so file)
+                # to allow post-mortem inspection.
+                host_dso_binary_path = os.path.join(host_files_dir_path, "test_binary.so")
+                built_module.save(host_dso_binary_path)
+                print(f"SAVED BINARY TO HOST PATH: {host_dso_binary_path}")
+
+                hexagon_mod = hexagon_session.load_module(built_module)
+
+                # Generate the input tensor's data.
+                # Note that we'll eventually need it in two different layouts:
+                # (1) NHWC as an argument to testing.poolnd_python.
+                # (2) NHWC_8h8w32c as an argument to our Hexagon primfunc.
+                a_numpy_4d = _create_test_input(input_tensor_shape_4d, DTYPE)
+
+                ref_output_4d = testing.poolnd_python(
+                    a_numpy_4d.astype("int32"),
+                    KERNEL,
+                    STRIDE,
+                    DILATION,
+                    PADDING[0:2],
+                    PADDING[2:],
+                    pool_type="max",
+                    dtype="int32",
+                    layout="NHWC",
+                ).astype(DTYPE)
+
+                output_tensor_shape_4d = ref_output_4d.shape
+
+                a_numpy_7d = _int8_nhwc_8h8w32c_xform_immediate(a_numpy_4d)
+
+                a_hexagon_7d = allocate_hexagon_array(
+                    hexagon_session.device,
+                    tensor_shape=input_tensor_shape_7d,
+                    axis_separators=[4],
+                    dtype=DTYPE,
+                    mem_scope=IO_TENSOR_MEM_SCOPE,
+                )
+
+                c_hexagon_4d = allocate_hexagon_array(
+                    hexagon_session.device,
+                    tensor_shape=output_tensor_shape_4d,
+                    axis_separators=[],
+                    dtype=DTYPE,
+                    mem_scope=IO_TENSOR_MEM_SCOPE,
+                )
+
+                a_hexagon_7d.copyfrom(a_numpy_7d)
+
+                if DTYPE == "int8":
+                    rel_tolerance = 0
+                    abs_tolerance = 0
+                else:
+                    assert False, f"TODO: decide acceptable tolerances for DTYPE {DTYPE}"
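+
+                # number=10, repeat=1: each measurement averages 10 back-to-back runs of
+                # the kernel, and a single such measurement is reported.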
+                timer = hexagon_mod.time_evaluator(
+                    "main", hexagon_session.device, number=10, repeat=1
+                )
+                timing_result = timer(a_hexagon_7d, c_hexagon_4d)
+
+                try:
+                    tvm.testing.assert_allclose(
+                        ref_output_4d, c_hexagon_4d.numpy(), rtol=rel_tolerance, atol=abs_tolerance
+                    )
+                except AssertionError as e:
+                    raise bu.NumericalAccuracyException(str(e))
+
+            except bu.NumericalAccuracyException as e:
+                print()
+                print("FAIL: Numerical accuracy error. See log file.")
+
+                log_file.write("\n")
+                log_file.write(f"FAIL: {e}\n")
+
+                self.benchmark_table.record_fail(
+                    **keys_dict, comments="Numerical accuracy error. See log file."
+                )
+
+            except bu.UnsupportedException as e:
+                print()
+                print(f"SKIP: {e}")
+
+                log_file.write("\n")
+                log_file.write(f"SKIP: {e}\n")
+
+                self.benchmark_table.record_skip(
+                    **keys_dict, comments=f"Unsupported configuration: {e}"
+                )
+
+            else:
+                # Record success only when no exception was raised; `timing_result`
+                # is undefined when the benchmark was skipped or failed.
+                self.benchmark_table.record_success(timing_result, **keys_dict)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py
index 1a4f23ad467a5..987d425aa63d4 100644
--- a/tests/python/relay/aot/test_crt_aot.py
+++ b/tests/python/relay/aot/test_crt_aot.py
@@ -44,6 +44,7 @@
     create_relay_module_and_inputs_from_tflite_file,
 )
 from tvm.micro.testing.aot_test_utils import AOT_DEFAULT_RUNNER, parametrize_aot_options
+from tvm.micro.testing.utils import get_conv2d_relay_module


 def test_error_c_interface_with_packed_api():
@@ -76,22 +77,7 @@ def test_error_c_interface_with_packed_api():
 @parametrize_aot_options
 def test_conv_with_params(interface_api, use_unpacked_api, test_runner):
     """Tests compilation of convolution with parameters"""
-    relay_model = """
-#[version = "0.0.5"]
-def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5), int8]) {
-    %1 = nn.conv2d(
-        %data,
-        %weight,
-        padding=[2, 2],
-        channels=8,
-        kernel_size=[5, 5],
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        out_dtype="int32");
-    %1
-}
-"""
-    mod = tvm.parser.fromtext(relay_model)
+    mod = get_conv2d_relay_module()
     main_func = mod["main"]
     shape_dict = {p.name_hint: p.checked_type.concrete_shape for p in main_func.params}
     type_dict = {p.name_hint: p.checked_type.dtype for p in main_func.params}
@@ -576,23 +562,7 @@ def test_multiple_models(interface_api, use_unpacked_api, test_runner):
     params1 = None

     # Convolution model
-    relay_model = """
-    #[version = "0.0.5"]
-    def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5), int8]) {
-        %1 = nn.conv2d(
-            %data,
-            %weight,
-            padding=[2, 2],
-            channels=8,
-            kernel_size=[5, 5],
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32");
-        %1
-    }
-    """
-
-    mod2 = tvm.parser.fromtext(relay_model)
+    mod2 = get_conv2d_relay_module()
     main_func = mod2["main"]
     shape_dict = {p.name_hint: p.checked_type.concrete_shape for p in main_func.params}
     type_dict = {p.name_hint: p.checked_type.dtype for p in main_func.params}
diff --git a/tests/python/relay/strategy/arm_cpu/test_avg_pool.py b/tests/python/relay/strategy/arm_cpu/test_avg_pool.py
index 31a812b38eed7..3d6690a1a16f8 100644
--- a/tests/python/relay/strategy/arm_cpu/test_avg_pool.py
+++ b/tests/python/relay/strategy/arm_cpu/test_avg_pool.py
@@ -14,9 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import sys
 import numpy as np
-import pytest
 import tvm
 import tvm.testing
 from tvm import relay
@@ -165,4 +163,4 @@ class TestAvgPool3d(BasicPoolTests):


 if __name__ == "__main__":
-    sys.exit(pytest.main([__file__] + sys.argv[1:]))
+    tvm.testing.main()
diff --git a/tests/python/relay/strategy/arm_cpu/test_conv1d_ncw.py b/tests/python/relay/strategy/arm_cpu/test_conv1d_ncw.py
index 0f0507cfe7d3d..b1dda10c42944 100644
--- a/tests/python/relay/strategy/arm_cpu/test_conv1d_ncw.py
+++ b/tests/python/relay/strategy/arm_cpu/test_conv1d_ncw.py
@@ -14,9 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import sys import numpy as np -import pytest import tvm import tvm.testing from tvm import relay @@ -114,4 +112,4 @@ class TestConv1d_ncw(BasicConv1dTests): if __name__ == "__main__": - sys.exit(pytest.main([__file__] + sys.argv[1:])) + tvm.testing.main() diff --git a/tests/python/relay/strategy/arm_cpu/test_conv1d_nwc.py b/tests/python/relay/strategy/arm_cpu/test_conv1d_nwc.py index e430ade2fac14..3daed6221f68e 100644 --- a/tests/python/relay/strategy/arm_cpu/test_conv1d_nwc.py +++ b/tests/python/relay/strategy/arm_cpu/test_conv1d_nwc.py @@ -14,9 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import sys import numpy as np -import pytest import tvm import tvm.testing from tvm import relay @@ -142,4 +140,4 @@ class TestConv1d_nwc(BasicConv1dTests): if __name__ == "__main__": - sys.exit(pytest.main([__file__] + sys.argv[1:])) + tvm.testing.main() diff --git a/tests/python/relay/strategy/arm_cpu/test_conv2d_NCHWc.py b/tests/python/relay/strategy/arm_cpu/test_conv2d_NCHWc.py index 3b43d37c9075f..8ca132ffba759 100644 --- a/tests/python/relay/strategy/arm_cpu/test_conv2d_NCHWc.py +++ b/tests/python/relay/strategy/arm_cpu/test_conv2d_NCHWc.py @@ -14,9 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import sys import numpy as np -import pytest import tvm import tvm.testing from tvm import relay @@ -135,4 +133,4 @@ class TestConv2d_NCHWc(BasicConv2dTests): if __name__ == "__main__": - sys.exit(pytest.main([__file__] + sys.argv[1:])) + tvm.testing.main() diff --git a/tests/python/relay/strategy/arm_cpu/test_dense_dsp.py b/tests/python/relay/strategy/arm_cpu/test_dense_dsp.py index 3edffba8acaa6..a69ea6c09e790 100644 --- a/tests/python/relay/strategy/arm_cpu/test_dense_dsp.py +++ b/tests/python/relay/strategy/arm_cpu/test_dense_dsp.py @@ -14,9 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import sys import numpy as np -import pytest import tvm import tvm.testing from tvm import relay @@ -87,4 +85,4 @@ class TestDense(BasicDenseTests): if __name__ == "__main__": - sys.exit(pytest.main([__file__] + sys.argv[1:])) + tvm.testing.main() diff --git a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py index 96628a6371d00..ee0d51c321f79 100644 --- a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py +++ b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py @@ -14,9 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import sys import numpy as np -import pytest import tvm import tvm.testing from tvm import relay diff --git a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d_NCHWc.py b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d_NCHWc.py index 69e9ab09e4c95..178b44edbd403 100644 --- a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d_NCHWc.py +++ b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d_NCHWc.py @@ -14,9 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-import sys import numpy as np -import pytest import tvm import tvm.testing from tvm import relay @@ -118,4 +116,4 @@ class TestDepthWiseConv2d_NCHWc(BasicConv2dTests): if __name__ == "__main__": - sys.exit(pytest.main([__file__] + sys.argv[1:])) + tvm.testing.main() diff --git a/tests/python/relay/strategy/arm_cpu/test_group_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_group_conv2d.py index b24c651de988f..47fe6d9f74c25 100644 --- a/tests/python/relay/strategy/arm_cpu/test_group_conv2d.py +++ b/tests/python/relay/strategy/arm_cpu/test_group_conv2d.py @@ -14,9 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import sys import numpy as np -import pytest import tvm import tvm.testing from tvm import relay diff --git a/tests/python/relay/strategy/arm_cpu/test_max_pool.py b/tests/python/relay/strategy/arm_cpu/test_max_pool.py index f58a041ecb746..ee890261d1b42 100644 --- a/tests/python/relay/strategy/arm_cpu/test_max_pool.py +++ b/tests/python/relay/strategy/arm_cpu/test_max_pool.py @@ -14,10 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -from pickle import FALSE -import sys import numpy as np -import pytest import tvm import tvm.testing from tvm import relay @@ -132,4 +129,4 @@ class TestMaxPool3d(BasicPoolTests): if __name__ == "__main__": - sys.exit(pytest.main([__file__] + sys.argv[1:])) + tvm.testing.main() diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index dedeae56e9daf..58b41189a0f0c 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -926,11 +926,11 @@ def test_dnnl_fuse(): conv2d_relu_pat, conv2d_sigmoid_pat, ) = ( - dnnl_patterns[1], - dnnl_patterns[13], - dnnl_patterns[20], - dnnl_patterns[26], - dnnl_patterns[38], + dnnl_patterns[3], + dnnl_patterns[15], + dnnl_patterns[22], + dnnl_patterns[28], + dnnl_patterns[40], ) def get_blocks( diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py index 541f3bba13da2..06614977d4caf 100644 --- a/tests/python/relay/test_pipeline_executor.py +++ b/tests/python/relay/test_pipeline_executor.py @@ -595,6 +595,8 @@ def test_pipeline(): if input_map[0] == "0": input_data = pipeline_module_test.get_input("data_a") tvm.testing.assert_allclose(data, input_data.numpy()) + + assert pipeline_module_test.num_inputs == 2 # Running the pipeline executor in the pipeline mode. 
     pipeline_module_test.run()
diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py
index afa745760895c..80c2fbaeb416c 100644
--- a/tests/python/unittest/test_link_params.py
+++ b/tests/python/unittest/test_link_params.py
@@ -264,7 +264,8 @@ def test_c_link_params(linkable_dtype):
     c_dtype = _get_c_datatype(linkable_dtype)
     src_lines = src.split("\n")
     param = param_init[f"{linkable_dtype}_a"].reshape(np.prod(KERNEL_SHAPE))
-    param_def = rf"^static const {c_dtype} __attribute__\(\(section\(\".rodata.tvm\"\), aligned\(16\)\)\) constant_\d+\[{np.prod(param.shape)}\] = {{$"
+    param_def = rf"^static const {c_dtype} __attribute__\(\(section\(\".rodata.tvm\"\), aligned\(16\)\)\) [a-zA-Z_0-9]*constant_\d+\[{np.prod(param.shape)}\] = {{$"
+
     for i, line in enumerate(src_lines):
         if re.match(param_def, line):
             i += 1
diff --git a/tests/python/unittest/test_micro_model_library_format.py b/tests/python/unittest/test_micro_model_library_format.py
index d707e6b4646b7..0caae1cdd9d46 100644
--- a/tests/python/unittest/test_micro_model_library_format.py
+++ b/tests/python/unittest/test_micro_model_library_format.py
@@ -15,15 +15,19 @@
 # specific language governing permissions and limitations
 # under the License.

+import pathlib
+import sys
 import datetime
 import json
 import os
 import tarfile

-import numpy
+import numpy as np
 import pytest
 import platform

+pytest.importorskip("tvm.micro")
+
 import tvm
 import tvm.relay
 from tvm.relay.backend import Executor, Runtime
@@ -31,12 +35,14 @@
 import tvm.runtime.module
 import tvm.testing
 from tvm.contrib import utils
+import tvm.micro as micro
+from tvm.micro.testing.utils import get_conv2d_relay_module
+import tvm.micro.model_library_format as model_library_format
+from tvm.micro.model_library_format import _GENERATED_VERSION


 @tvm.testing.requires_micro
 def test_export_operator_model_library_format():
-    import tvm.micro as micro
-
     target = tvm.target.target.micro("host")
     with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
         A = tvm.te.placeholder((2,), dtype="int8")
@@ -63,7 +69,7 @@ def test_export_operator_model_library_format():

     with open(os.path.join(extract_dir, "metadata.json")) as json_f:
         metadata = json.load(json_f)
-        assert metadata["version"] == 6
+        assert metadata["version"] == _GENERATED_VERSION
         assert metadata["model_name"] == "add"
         export_datetime = datetime.datetime.strptime(
             metadata["export_datetime"], "%Y-%m-%d %H:%M:%SZ"
@@ -95,8 +101,35 @@ def test_export_operator_model_library_format():
         assert tir_f.read() == str(ir_mod)


+@tvm.testing.requires_micro
+def test_export_multiple_operator_model_library_format():
+    target = tvm.target.target.micro("host")
+    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
+        A = tvm.te.placeholder((2,), dtype="int8")
+        B = tvm.te.placeholder((1,), dtype="int8")
+        C = tvm.te.compute(A.shape, lambda i: A[i] + B[0], name="C")
+        sched = tvm.te.create_schedule(C.op)
+        mod = tvm.build(
+            sched,
+            [A, B, C],
+            tvm.target.Target(target, target),
+            runtime=Runtime("crt", {"system-lib": True}),
+            name="add",
+        )
+
+    temp_dir = utils.tempdir()
+    mlf_tar_path = temp_dir.relpath("lib.tar")
+
+    with pytest.raises(RuntimeError) as exc:
+        micro.export_model_library_format([mod, mod], mlf_tar_path)
+
+    assert str(exc.value) == "Multiple operator is not supported."
+
+
 def validate_graph_json(extract_dir, factory):
-    with open(os.path.join(extract_dir, "executor-config", "graph", "graph.json")) as graph_f:
+    with open(
os.path.join(extract_dir, "executor-config", "graph", f"{factory.libmod_name}.graph") + ) as graph_f: graph_json = graph_f.read() assert graph_json == factory.graph_json @@ -141,12 +174,11 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[ executor=executor, runtime=runtime, mod_name="add", - params={"c": numpy.array([[2.0, 4.0]], dtype="float32")}, + params={"c": np.array([[2.0, 4.0]], dtype="float32")}, ) temp_dir = utils.tempdir() mlf_tar_path = temp_dir.relpath("lib.tar") - import tvm.micro as micro micro.export_model_library_format(factory, mlf_tar_path) tf = tarfile.open(mlf_tar_path) @@ -157,21 +189,22 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[ with open(os.path.join(extract_dir, "metadata.json")) as json_f: metadata = json.load(json_f) - assert metadata["version"] == 6 - assert metadata["model_name"] == "add" + module_name = factory.libmod_name + assert metadata["version"] == _GENERATED_VERSION + assert metadata["modules"][module_name]["model_name"] == "add" export_datetime = datetime.datetime.strptime( - metadata["export_datetime"], "%Y-%m-%d %H:%M:%SZ" + metadata["modules"][module_name]["export_datetime"], "%Y-%m-%d %H:%M:%SZ" ) assert (datetime.datetime.now() - export_datetime) < datetime.timedelta(seconds=60 * 5) - assert metadata["target"] == [str(target)] + assert metadata["modules"][module_name]["target"] == [str(target)] if str(executor) == "graph": - assert metadata["memory"]["sids"] == [ + assert metadata["modules"][module_name]["memory"]["sids"] == [ {"storage_id": 0, "size_bytes": 2, "input_binding": "a"}, {"storage_id": 1, "size_bytes": 8, "input_binding": "b"}, {"storage_id": 2, "size_bytes": 8, "input_binding": "p0"}, {"storage_id": 3, "size_bytes": 8}, ] - assert metadata["memory"]["functions"]["main"] == [ + assert metadata["modules"][module_name]["memory"]["functions"]["main"] == [ { "constants_size_bytes": json_constants_size_bytes, "device": 1, @@ -179,12 +212,14 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[ "workspace_size_bytes": 0, } ] - assert metadata["memory"]["functions"]["operator_functions"][0]["workspace"] == [ - {"device": 1, "workspace_size_bytes": 0} - ] + assert metadata["modules"][module_name]["memory"]["functions"]["operator_functions"][0][ + "workspace" + ] == [{"device": 1, "workspace_size_bytes": 0}] assert ( "fused_cast_multiply_add" - in metadata["memory"]["functions"]["operator_functions"][0]["function_name"] + in metadata["modules"][module_name]["memory"]["functions"]["operator_functions"][0][ + "function_name" + ] ) assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "add_lib0.c")) @@ -196,7 +231,7 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[ if str(executor) == "graph": validate_graph_json(extract_dir, factory) - with open(os.path.join(extract_dir, "src", "relay.txt")) as relay_f: + with open(os.path.join(extract_dir, "src", f"{module_name}.relay")) as relay_f: assert relay_f.read() == str(relay_mod) with open(os.path.join(extract_dir, "parameters", "add.params"), "rb") as params_f: @@ -227,12 +262,11 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[ target, runtime=Runtime("crt", {"system-lib": True}), mod_name="add", - params={"c": numpy.array([[2.0, 4.0]], dtype="float32")}, + params={"c": np.array([[2.0, 4.0]], dtype="float32")}, ) temp_dir = utils.tempdir() mlf_tar_path = temp_dir.relpath("lib.tar") - import tvm.micro as micro 
micro.export_model_library_format(factory, mlf_tar_path) tf = tarfile.open(mlf_tar_path) @@ -243,20 +277,21 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[ with open(os.path.join(extract_dir, "metadata.json")) as json_f: metadata = json.load(json_f) - assert metadata["version"] == 6 - assert metadata["model_name"] == "add" + module_name = factory.libmod_name + assert metadata["version"] == _GENERATED_VERSION + assert metadata["modules"][module_name]["model_name"] == "add" export_datetime = datetime.datetime.strptime( - metadata["export_datetime"], "%Y-%m-%d %H:%M:%SZ" + metadata["modules"][module_name]["export_datetime"], "%Y-%m-%d %H:%M:%SZ" ) assert (datetime.datetime.now() - export_datetime) < datetime.timedelta(seconds=60 * 5) - assert metadata["target"] == [str(target)] - assert metadata["memory"]["sids"] == [ + assert metadata["modules"][module_name]["target"] == [str(target)] + assert metadata["modules"][module_name]["memory"]["sids"] == [ {"storage_id": 0, "size_bytes": 2, "input_binding": "a"}, {"storage_id": 1, "size_bytes": 8, "input_binding": "b"}, {"storage_id": 2, "size_bytes": 8, "input_binding": "p0"}, {"storage_id": 3, "size_bytes": 8}, ] - assert metadata["memory"]["functions"]["main"] == [ + assert metadata["modules"][module_name]["memory"]["functions"]["main"] == [ { "constants_size_bytes": 8, "device": 1, @@ -264,19 +299,21 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[ "workspace_size_bytes": 0, } ] - assert metadata["memory"]["functions"]["operator_functions"][0]["workspace"] == [ - {"device": 1, "workspace_size_bytes": 0} - ] + assert metadata["modules"][module_name]["memory"]["functions"]["operator_functions"][0][ + "workspace" + ] == [{"device": 1, "workspace_size_bytes": 0}] assert ( "fused_cast_multiply_add" - in metadata["memory"]["functions"]["operator_functions"][0]["function_name"] + in metadata["modules"][module_name]["memory"]["functions"]["operator_functions"][0][ + "function_name" + ] ) assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "lib", "add_lib0.o")) validate_graph_json(extract_dir, factory) - with open(os.path.join(extract_dir, "src", "relay.txt")) as relay_f: + with open(os.path.join(extract_dir, "src", f"{module_name}.relay")) as relay_f: assert relay_f.read() == str(relay_mod) with open(os.path.join(extract_dir, "parameters", "add.params"), "rb") as params_f: @@ -314,7 +351,6 @@ def @main(%p0: Tensor[(1, 56, 56, 128), int16], %p1: Tensor[(3, 3, 128, 1), int1 temp_dir = utils.tempdir() mlf_tar_path = temp_dir.relpath("lib.tar") - import tvm.micro as micro micro.export_model_library_format(factory, mlf_tar_path) tf = tarfile.open(mlf_tar_path) @@ -325,14 +361,15 @@ def @main(%p0: Tensor[(1, 56, 56, 128), int16], %p1: Tensor[(3, 3, 128, 1), int1 with open(os.path.join(extract_dir, "metadata.json")) as json_f: metadata = json.load(json_f) - assert metadata["version"] == 6 - assert metadata["model_name"] == "qnn_conv2d" + module_name = factory.libmod_name + assert metadata["version"] == _GENERATED_VERSION + assert metadata["modules"][module_name]["model_name"] == "qnn_conv2d" export_datetime = datetime.datetime.strptime( - metadata["export_datetime"], "%Y-%m-%d %H:%M:%SZ" + metadata["modules"][module_name]["export_datetime"], "%Y-%m-%d %H:%M:%SZ" ) assert (datetime.datetime.now() - export_datetime) < datetime.timedelta(seconds=60 * 5) - assert metadata["target"] == [str(target)] - assert metadata["memory"]["functions"]["main"] == [ + assert 
metadata["modules"][module_name]["target"] == [str(target)] + assert metadata["modules"][module_name]["memory"]["functions"]["main"] == [ { "constants_size_bytes": 0, "device": 1, @@ -340,12 +377,14 @@ def @main(%p0: Tensor[(1, 56, 56, 128), int16], %p1: Tensor[(3, 3, 128, 1), int1 "workspace_size_bytes": 2466816, } ] - assert metadata["memory"]["functions"]["operator_functions"][0]["workspace"] == [ - {"device": 1, "workspace_size_bytes": 2466816} - ] + assert metadata["modules"][module_name]["memory"]["functions"]["operator_functions"][0][ + "workspace" + ] == [{"device": 1, "workspace_size_bytes": 2466816}] assert ( "fused_nn_conv2d_add_fixed_point_multiply_clip_cast" - in metadata["memory"]["functions"]["operator_functions"][0]["function_name"] + in metadata["modules"][module_name]["memory"]["functions"]["operator_functions"][0][ + "function_name" + ] ) @@ -354,11 +393,9 @@ def test_export_non_dso_exportable(): module = tvm.support.FrontendTestModule() temp_dir = utils.tempdir() - import tvm.micro as micro - import tvm.micro.model_library_format as model_library_format with pytest.raises(micro.UnsupportedInModelLibraryFormatError) as exc: - model_library_format._populate_codegen_dir(module, temp_dir.relpath("codegen")) + model_library_format._populate_codegen_dir([module], temp_dir.relpath("codegen")) assert str(exc.exception) == ( "Don't know how to export non-c or non-llvm modules; found: ffi_testing" @@ -408,8 +445,6 @@ def test_export_byoc_c_module(): temp_dir = utils.tempdir() mlf_tar_path = temp_dir.relpath("lib.tar") - from tvm import micro - micro.export_model_library_format(factory, mlf_tar_path) with tarfile.open(mlf_tar_path, "r:*") as tf: @@ -418,7 +453,7 @@ def test_export_byoc_c_module(): assert "./metadata.json" in tar_members with tf.extractfile("./metadata.json") as f: metadata = json.load(f) - main_md = metadata["memory"]["functions"]["main"] + main_md = metadata["modules"][factory.libmod_name]["memory"]["functions"]["main"] if platform.architecture()[0] == "64bit": assert main_md == [ { @@ -439,5 +474,140 @@ def test_export_byoc_c_module(): ] +@tvm.testing.requires_micro +def test_multiple_relay_modules_same_module_name(): + mod = get_conv2d_relay_module() + + executor = Executor("graph") + runtime = Runtime("crt") + target = tvm.target.target.micro("host") + + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + factory1 = tvm.relay.build(mod, target, runtime=runtime, executor=executor, mod_name="mod") + factory2 = tvm.relay.build(mod, target, runtime=runtime, executor=executor, mod_name="mod") + + temp_dir = utils.tempdir() + mlf_tar_path = temp_dir.relpath("lib.tar") + + with pytest.raises(AssertionError, match="Multiple modules should have unique names"): + micro.export_model_library_format([factory1, factory2], mlf_tar_path) + + +@tvm.testing.requires_micro +def test_multiple_relay_modules_graph(): + mod = get_conv2d_relay_module() + + executor = Executor("graph") + runtime = Runtime("crt") + target = tvm.target.target.micro("host") + + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + factory1 = tvm.relay.build(mod, target, runtime=runtime, executor=executor, mod_name="mod1") + factory2 = tvm.relay.build(mod, target, runtime=runtime, executor=executor, mod_name="mod2") + + temp_dir = utils.tempdir() + mlf_tar_path = temp_dir.relpath("lib.tar") + micro.export_model_library_format([factory1, factory2], mlf_tar_path) + + with tarfile.open(mlf_tar_path, "r:*") as tf: + tar_members = [ti.name for 
ti in tf.getmembers()] + print("tar members", tar_members) + assert "./metadata.json" in tar_members + assert "./codegen/host/src/mod1_lib0.c" in tar_members + assert "./codegen/host/src/mod2_lib0.c" in tar_members + + with tf.extractfile("./metadata.json") as f: + metadata = json.load(f) + mod2_main_md = metadata["modules"]["mod2"]["memory"]["functions"]["main"] + assert mod2_main_md == [ + { + "constants_size_bytes": 0, + "device": 1, + "io_size_bytes": 143960, + "workspace_size_bytes": 158088, + } + ] + assert metadata["modules"]["mod1"]["model_name"] == "mod1" + assert metadata["modules"]["mod2"]["model_name"] == "mod2" + + +@tvm.testing.requires_micro +def test_multiple_relay_modules_c(): + mod = get_conv2d_relay_module() + + executor = Executor("aot", {"unpacked-api": True, "interface-api": "c"}) + runtime = Runtime("crt") + target = tvm.target.target.micro("host") + + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + factory1 = tvm.relay.build(mod, target, runtime=runtime, executor=executor, mod_name="mod1") + factory2 = tvm.relay.build(mod, target, runtime=runtime, executor=executor, mod_name="mod2") + + temp_dir = utils.tempdir() + mlf_tar_path = temp_dir.relpath("lib.tar") + + micro.export_model_library_format([factory1, factory2], mlf_tar_path) + + tf = tarfile.open(mlf_tar_path) + + extract_dir = temp_dir.relpath("extract") + os.mkdir(extract_dir) + tf.extractall(extract_dir) + + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "mod1_lib0.c")) + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "mod1_lib1.c")) + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "mod2_lib0.c")) + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "mod2_lib1.c")) + + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "include", "tvmgen_mod1.h")) + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "include", "tvmgen_mod2.h")) + + # check CRT runtime directory + assert os.path.exists(os.path.join(extract_dir, "runtime")) + + +@tvm.testing.requires_micro +def test_multiple_relay_modules_aot_graph(): + mod = get_conv2d_relay_module() + + executor1 = Executor("graph") + executor2 = Executor("aot", {"unpacked-api": True, "interface-api": "c"}) + runtime = Runtime("crt") + target = tvm.target.target.micro("host") + + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + factory1 = tvm.relay.build( + mod, target, runtime=runtime, executor=executor1, mod_name="mod1" + ) + factory2 = tvm.relay.build( + mod, target, runtime=runtime, executor=executor2, mod_name="mod2" + ) + + temp_dir = utils.tempdir() + mlf_tar_path = temp_dir.relpath("lib.tar") + + micro.export_model_library_format([factory1, factory2], mlf_tar_path) + + tf = tarfile.open(mlf_tar_path) + extract_dir = temp_dir.relpath("extract") + os.mkdir(extract_dir) + tf.extractall(extract_dir) + + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "mod1_lib0.c")) + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "mod1_lib1.c")) + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "mod1_lib2.c")) + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "mod2_lib0.c")) + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "mod2_lib1.c")) + + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "include", "tvmgen_mod2.h")) + + with 
open(os.path.join(extract_dir, "metadata.json")) as f: + metadata = json.load(f) + + assert metadata["modules"]["mod1"]["executors"] == ["graph"] + assert metadata["modules"]["mod2"]["executors"] == ["aot"] + assert metadata["version"] == _GENERATED_VERSION + + if __name__ == "__main__": - tvm.testing.main() + sys.exit(pytest.main([__file__] + sys.argv[1:]))