diff --git a/apps/microtvm/arduino/template_project/microtvm_api_server.py b/apps/microtvm/arduino/template_project/microtvm_api_server.py
index 131f92a208298..0e922f06cb515 100644
--- a/apps/microtvm/arduino/template_project/microtvm_api_server.py
+++ b/apps/microtvm/arduino/template_project/microtvm_api_server.py
@@ -214,14 +214,21 @@ def _template_model_header(self, source_dir, metadata):
         with open(source_dir / "model.h", "r") as f:
             model_h_template = Template(f.read())
 
-        assert (
-            metadata["style"] == "full-model"
+        all_module_names = []
+        for name in metadata["modules"].keys():
+            all_module_names.append(name)
+
+        assert all(
+            metadata["modules"][mod_name]["style"] == "full-model" for mod_name in all_module_names
         ), "when generating AOT, expect only full-model Model Library Format"
 
-        template_values = {
-            "workspace_size_bytes": metadata["memory"]["functions"]["main"][0][
+        workspace_size_bytes = 0
+        for mod_name in all_module_names:
+            workspace_size_bytes += metadata["modules"][mod_name]["memory"]["functions"]["main"][0][
                 "workspace_size_bytes"
-            ],
+            ]
+        template_values = {
+            "workspace_size_bytes": workspace_size_bytes,
         }
 
         with open(source_dir / "model.h", "w") as f:
diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu
index 22c372cc70b00..f04d8515b8dc2 100644
--- a/docker/Dockerfile.ci_gpu
+++ b/docker/Dockerfile.ci_gpu
@@ -139,6 +139,10 @@ COPY install/ubuntu_install_sccache.sh /install/ubuntu_install_sccache.sh
 RUN bash /install/ubuntu_install_sccache.sh
 ENV PATH /opt/sccache:$PATH
 
+# dnnl
+COPY install/ubuntu_install_dnnl.sh /install/ubuntu_install_dnnl.sh
+RUN bash /install/ubuntu_install_dnnl.sh
+
 # Environment variables
 ENV PATH=/usr/local/nvidia/bin:${PATH}
 ENV PATH=/usr/local/cuda/bin:${PATH}
diff --git a/docker/clear-stale-images.sh b/docker/clear-stale-images.sh
new file mode 100755
index 0000000000000..1e1e4b86a4d7b
--- /dev/null
+++ b/docker/clear-stale-images.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Remove tvm-related docker images from the local system which
+# are not used by the currently-checked-out branch in this git
+# repository plus any linked worktrees.
+
+set -euo pipefail
+
+dry_run=0
+repositories=( "$(cd $(dirname "$0") && git rev-parse --show-toplevel)" )
+skip_confirm=0
+verbose=0
+while [ "${1+x}" == "x" ]; do
+    case "$1" in
+        --help|-h)
+            echo "usage: $0 [-n] [-v] [-y] [<repository> ...]"
+            echo ""
+            echo "Remove tvm-related docker images from the local system which"
+            echo "are not used by the currently-checked-out branch in this git"
+            echo "repository plus any linked worktrees."
+ echo "" + echo 'This command should remove only docker images beginning with "tlcpack"' + echo "" + echo "Options:" + echo " -n Perform a dry-run and just print the docker rmi command" + echo " -v Verbosely list the images kept and why" + echo " -y Skip confirmation" + echo " Additional git repositories to consult." + exit 2 + ;; + -n) + dry_run=1 + ;; + -v) + verbose=1 + ;; + -y) + skip_confirm=1 + ;; + *) + repositories=( "${repositories[@]}" "$1" ) + ;; + esac + shift +done + +declare -a used_images +for r in "${repositories[@]}"; do + if [ -d "${r}/.git" ]; then + worktree="${r}" + else + worktree="$(cat "${r}/.git")" + fi + while read wt; do + d="${wt:9:${#wt}}" # strip "worktree " prefix + for img in $(cat "${d}/Jenkinsfile" | grep -E '^ci_[a-z]+ = ' | sed -E "s/ci_[a-z]+ = '([^\"]*)'/\1/"); do + used_images=( "${used_images[@]}" "${img}" ) + done + done < <(cd "${worktree}" && git worktree list --porcelain | grep '^worktree ') +done + +declare -a to_rm +while read image; do + if [ "${image}" == ":" ]; then + continue + fi + grep -qE "^tlcpack" < <(echo "$image") && is_tlcpack=1 || is_tlcpack=0 + if [ $is_tlcpack -eq 0 ]; then # non-tlcpack image + if [ $verbose -ne 0 ]; then + echo "skipping (non-tvm): $image" + fi + continue + fi + grep -q "$image" < <(echo "${used_images[@]}") && is_used=1 || is_used=0 + if [ $is_used -eq 1 ]; then # Image was found in used_images + if [ $verbose -ne 0 ]; then + echo "skipping (image used): $image" + fi + continue + fi + to_rm=( "${to_rm[@]}" "${image}" ) +done < <(docker images --format '{{.Repository}}:{{.Tag}}') + +docker_cmd=( docker rmi "${to_rm[@]}" ) +if [ ${dry_run} -ne 0 ]; then + echo "would run: ${docker_cmd[@]}" +else + if [ $skip_confirm -eq 0 ]; then + echo "will run: ${docker_cmd[@]}" + read -p "Proceed? [y/N] " proceed + if [ "${proceed-}" != "y" -a "${proceed-}" != "Y" ]; then + echo "Aborted." + exit 2 + fi + fi + "${docker_cmd[@]}" +fi diff --git a/docs/contribute/pull_request.rst b/docs/contribute/pull_request.rst index 82b5c5d43f416..26989fb8e6a3b 100644 --- a/docs/contribute/pull_request.rst +++ b/docs/contribute/pull_request.rst @@ -113,6 +113,14 @@ each time (e.g. you can test a change in CPU and i386 while retaining incrementa # run the CPU build and drop into a shell in the container python tests/scripts/ci.py cpu --interactive +We regularly update our docker images and, over time, stale images may unnecessarily consume disk +space. You can remove stale images that aren't used in the presently checked-out branch plus any +other worktrees using the following command: + +.. code:: bash + docker/clear-stale-images.sh + +Consult the ``--help`` for more options. C++ (local) ^^^^^^^^^^^ diff --git a/include/tvm/meta_schedule/database.h b/include/tvm/meta_schedule/database.h index 37a315bf744e9..b22d8beddbabb 100644 --- a/include/tvm/meta_schedule/database.h +++ b/include/tvm/meta_schedule/database.h @@ -98,6 +98,9 @@ struct WorkloadEqual { } }; +/*! \brief The class of measure candidates. */ +class MeasureCandidate; + /*! \brief The class of tuning records. */ class TuningRecordNode : public runtime::Object { public: @@ -123,6 +126,9 @@ class TuningRecordNode : public runtime::Object { static constexpr const char* _type_key = "meta_schedule.TuningRecord"; TVM_DECLARE_FINAL_OBJECT_INFO(TuningRecordNode, runtime::Object); + /*! \brief Construct the measure candidate given the initial IR module and trace + * stored in the tuning record. */ + MeasureCandidate AsMeasureCandidate() const; /*! 
diff --git a/include/tvm/meta_schedule/database.h b/include/tvm/meta_schedule/database.h
index 37a315bf744e9..b22d8beddbabb 100644
--- a/include/tvm/meta_schedule/database.h
+++ b/include/tvm/meta_schedule/database.h
@@ -98,6 +98,9 @@ struct WorkloadEqual {
   }
 };
 
+/*! \brief The class of measure candidates. */
+class MeasureCandidate;
+
 /*! \brief The class of tuning records. */
 class TuningRecordNode : public runtime::Object {
  public:
@@ -123,6 +126,9 @@ class TuningRecordNode : public runtime::Object {
   static constexpr const char* _type_key = "meta_schedule.TuningRecord";
   TVM_DECLARE_FINAL_OBJECT_INFO(TuningRecordNode, runtime::Object);
 
+  /*! \brief Construct the measure candidate given the initial IR module and trace
+   * stored in the tuning record. */
+  MeasureCandidate AsMeasureCandidate() const;
   /*!
    * \brief Export the tuning record to a JSON string.
    * \return An array containing the trace, running secs, serialized target, and
@@ -187,6 +193,11 @@ class DatabaseNode : public runtime::Object {
   * \return An array of top K tuning records for the given workload.
   */
  virtual Array<TuningRecord> GetTopK(const Workload& workload, int top_k) = 0;
+ /*!
+  * \brief Get all tuning records from the database.
+  * \return An Array of all the tuning records in the database.
+  */
+ virtual Array<TuningRecord> GetAllTuningRecords() = 0;
  /*!
   * \brief Get the size of the database.
   * \return The size of the database.
@@ -224,6 +235,11 @@ class PyDatabaseNode : public DatabaseNode {
   * \return An array of top K tuning records for the given workload.
   */
  using FGetTopK = runtime::TypedPackedFunc<Array<TuningRecord>(const Workload&, int)>;
+ /*!
+  * \brief The function type of `GetAllTuningRecords` method.
+  * \return An Array of all the tuning records in the database.
+  */
+ using FGetAllTuningRecords = runtime::TypedPackedFunc<Array<TuningRecord>()>;
  /*!
   * \brief The function type of `Size` method.
   * \return The size of the database.
@@ -238,6 +254,8 @@ class PyDatabaseNode : public DatabaseNode {
  FCommitTuningRecord f_commit_tuning_record;
  /*! \brief The packed function to the `GetTopK` function. */
  FGetTopK f_get_top_k;
+ /*! \brief The packed function to the `GetAllTuningRecords` function. */
+ FGetAllTuningRecords f_get_all_tuning_records;
  /*! \brief The packed function to the `Size` function. */
  FSize f_size;
 
@@ -249,6 +267,7 @@ class PyDatabaseNode : public DatabaseNode {
    // `f_commit_workload` is not visited
    // `f_commit_tuning_record` is not visited
    // `f_get_top_k` is not visited
+   // `f_get_all_tuning_records` is not visited
    // `f_size` is not visited
  }
 
@@ -273,6 +292,12 @@ class PyDatabaseNode : public DatabaseNode {
    return f_get_top_k(workload, top_k);
  }
 
+ Array<TuningRecord> GetAllTuningRecords() final {
+   ICHECK(f_get_all_tuning_records != nullptr)
+       << "PyDatabase's GetAllTuningRecords method not implemented!";
+   return f_get_all_tuning_records();
+ }
+
  int64_t Size() final {
    ICHECK(f_size != nullptr) << "PyDatabase's Size method not implemented!";
    return f_size();
@@ -302,6 +327,7 @@ class Database : public runtime::ObjectRef {
   * \param f_commit_workload The packed function of `CommitWorkload`.
   * \param f_commit_tuning_record The packed function of `CommitTuningRecord`.
   * \param f_get_top_k The packed function of `GetTopK`.
+  * \param f_get_all_tuning_records The packed function of `GetAllTuningRecords`.
   * \param f_size The packed function of `Size`.
   * \return The created database.
   */
@@ -309,6 +335,7 @@ class Database : public runtime::ObjectRef {
                              PyDatabaseNode::FCommitWorkload f_commit_workload,
                              PyDatabaseNode::FCommitTuningRecord f_commit_tuning_record,
                              PyDatabaseNode::FGetTopK f_get_top_k,
+                             PyDatabaseNode::FGetAllTuningRecords f_get_all_tuning_records,
                              PyDatabaseNode::FSize f_size);
   TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS(Database, runtime::ObjectRef, DatabaseNode);
 };
diff --git a/python/tvm/auto_scheduler/testing/tune_onnx.py b/python/tvm/auto_scheduler/testing/tune_onnx.py
index 8dc778e8f49ca..5fbc875d1eda9 100644
--- a/python/tvm/auto_scheduler/testing/tune_onnx.py
+++ b/python/tvm/auto_scheduler/testing/tune_onnx.py
@@ -28,6 +28,7 @@
 from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc
 from tvm.meta_schedule.utils import cpu_count
 from tvm.relay.frontend import from_onnx
+from tvm.support import describe
 
 
 def _parse_args():
@@ -148,6 +149,7 @@ def main():
     else:
         raise NotImplementedError(f"Unsupported target {ARGS.target}")
 
+    describe()
     print(f"Workload: {ARGS.model_name}")
     onnx_model = onnx.load(ARGS.onnx_path)
     shape_dict = {}
diff --git a/python/tvm/auto_scheduler/testing/tune_relay.py b/python/tvm/auto_scheduler/testing/tune_relay.py
index f1a2b573a69be..58ea327ec50b6 100644
--- a/python/tvm/auto_scheduler/testing/tune_relay.py
+++ b/python/tvm/auto_scheduler/testing/tune_relay.py
@@ -146,6 +146,9 @@ def main():
         )
     else:
         raise NotImplementedError(f"Unsupported target {ARGS.target}")
+
+    describe()
+    print(f"Workload: {ARGS.workload}")
     mod, params, (input_name, input_shape, input_dtype) = get_network(
         ARGS.workload,
         ARGS.input_shape,
@@ -153,7 +156,6 @@ def main():
     )
     input_info = {input_name: input_shape}
     input_data = {}
-    print(f"Workload: {ARGS.workload}")
     for input_name, input_shape in input_info.items():
         print(f"  input_name: {input_name}")
         print(f"  input_shape: {input_shape}")
diff --git a/python/tvm/auto_scheduler/testing/tune_te.py b/python/tvm/auto_scheduler/testing/tune_te.py
index d640a32c29419..4a6874a53d347 100644
--- a/python/tvm/auto_scheduler/testing/tune_te.py
+++ b/python/tvm/auto_scheduler/testing/tune_te.py
@@ -91,6 +91,8 @@ def _parse_args():
 
 
 def main():
+    describe()
+    print(f"Workload: {ARGS.workload}")
     log_file = os.path.join(ARGS.work_dir, f"{ARGS.workload}.json")
     workload_func, params = CONFIGS[ARGS.workload]
     params = params[0]  # type: ignore
diff --git a/python/tvm/contrib/pipeline_executor.py b/python/tvm/contrib/pipeline_executor.py
index a50fffaa2b434..5ef309bb28080 100644
--- a/python/tvm/contrib/pipeline_executor.py
+++ b/python/tvm/contrib/pipeline_executor.py
@@ -55,6 +55,7 @@ def __init__(self, module):
         self._get_input = self.module["get_input"]
         self._get_output = self.module["get_output"]
         self._get_num_outputs = self.module["get_num_outputs"]
+        self._get_num_inputs = self.module["get_num_inputs"]
         self._get_input_pipeline_map = self.module["get_input_pipeline_map"]
         self._get_pipe_execute_count = self.module["get_execute_count"]
 
@@ -159,6 +160,16 @@ def num_outputs(self):
         """
         return self._get_num_outputs()
 
+    @property
+    def num_inputs(self):
+        """Get the number of inputs
+
+        Returns
+        -------
+        count : int
+            The number of inputs
+        """
+        return self._get_num_inputs()
+
     @staticmethod
     def load_library(config_file_name):
         """Import files to create a pipeline executor.
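For context, a minimal usage sketch of the new `num_inputs` property (not part of the patch). `PipelineModule.load_library` is the loader shown above; the config path is purely illustrative and assumes a pipeline exported by an earlier build step.

```python
from tvm.contrib import pipeline_executor

# Hypothetical: load a previously exported pipeline; the path is illustrative.
pipe = pipeline_executor.PipelineModule.load_library("pipeline_out/config")
print(pipe.num_inputs)   # new property, backed by the "get_num_inputs" packed function
print(pipe.num_outputs)  # existing counterpart
```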
diff --git a/python/tvm/driver/tvmc/model.py b/python/tvm/driver/tvmc/model.py
index 04946ec9c6d0c..5f40d21223125 100644
--- a/python/tvm/driver/tvmc/model.py
+++ b/python/tvm/driver/tvmc/model.py
@@ -391,9 +391,20 @@ def import_package(self, package_path: str):
             with open(temp.relpath("metadata.json")) as metadata_json:
                 metadata = json.load(metadata_json)
 
-            has_graph_executor = "graph" in metadata["executors"]
-            graph = temp.relpath("executor-config/graph/graph.json") if has_graph_executor else None
-            params = temp.relpath(f'parameters/{metadata["model_name"]}.params')
+            all_module_names = []
+            for name in metadata["modules"].keys():
+                all_module_names.append(name)
+            assert len(all_module_names) == 1, "Multiple modules in MLF are not supported."
+
+            module_name = all_module_names[0]
+            module_metadata = metadata["modules"][module_name]
+            has_graph_executor = "graph" in module_metadata["executors"]
+            graph = (
+                temp.relpath(f"executor-config/graph/{module_name}.graph")
+                if has_graph_executor
+                else None
+            )
+            params = temp.relpath(f"parameters/{module_name}.params")
 
             self.type = "mlf"
         else:
diff --git a/python/tvm/meta_schedule/database/database.py b/python/tvm/meta_schedule/database/database.py
index 802a739e69582..0c11f77591cc6 100644
--- a/python/tvm/meta_schedule/database/database.py
+++ b/python/tvm/meta_schedule/database/database.py
@@ -115,6 +115,17 @@ def __init__(  # type: ignore # pylint: disable=too-many-arguments
             args_info,
         )
 
+    def as_measure_candidate(self) -> Any:
+        """Generate a measure candidate given an initial IR module and a trace
+        stored in the tuning record.
+
+        Returns
+        -------
+        candidate : MeasureCandidate
+            A generated candidate.
+        """
+        return _ffi_api.TuningRecordAsMeasureCandidate(self)  # type: ignore # pylint: disable=no-member
+
     def as_json(self) -> Any:
         """Export the tuning record to a JSON string.
 
@@ -203,6 +214,16 @@ def get_top_k(self, workload: Workload, top_k: int) -> List[TuningRecord]:
         """
         return _ffi_api.DatabaseGetTopK(self, workload, top_k)  # type: ignore # pylint: disable=no-member
 
+    def get_all_tuning_records(self) -> List[TuningRecord]:
+        """Get all the tuning records from the database.
+
+        Returns
+        -------
+        tuning_records : List[TuningRecord]
+            All tuning records from the database.
+        """
+        return _ffi_api.DatabaseGetAllTuningRecords(self)  # type: ignore # pylint: disable=no-member
+
     def __len__(self) -> int:
         """Get the number of records in the database.
 
@@ -229,6 +250,7 @@ def __init__(
         f_commit_workload: Callable = None,
         f_commit_tuning_record: Callable = None,
         f_get_top_k: Callable = None,
+        f_get_all_tuning_records: Callable = None,
         f_size: Callable = None,
     ):
         """Constructor."""
@@ -239,6 +261,7 @@ def __init__(
             f_commit_workload,
             f_commit_tuning_record,
             f_get_top_k,
+            f_get_all_tuning_records,
             f_size,
         )
 
@@ -258,6 +281,7 @@ class PyDatabase:
             "commit_workload",
             "commit_tuning_record",
             "get_top_k",
+            "get_all_tuning_records",
             "__len__",
         ],
     }
@@ -317,6 +341,16 @@ def get_top_k(self, workload: Workload, top_k: int) -> List[TuningRecord]:
         """
         raise NotImplementedError
 
+    def get_all_tuning_records(self) -> List[TuningRecord]:
+        """Get all the tuning records from the database.
+
+        Returns
+        -------
+        tuning_records : List[TuningRecord]
+            All tuning records from the database.
+        """
+        raise NotImplementedError
+
     def __len__(self) -> int:
         """Get the number of records in the database.
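Taken together with the C++ plumbing above, the new Python surface can be exercised as below — a sketch assuming a JSON database produced by an earlier tuning run (file names illustrative):

```python
from tvm import meta_schedule as ms

db = ms.database.JSONDatabase(
    path_workload="model/task_workload.json",
    path_tuning_record="model/task_candidates.json",
)
# Fetch every stored record, then rebuild measure candidates from the
# (workload IRModule, trace) pairs for re-measurement.
records = db.get_all_tuning_records()
candidates = [rec.as_measure_candidate() for rec in records]
```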
diff --git a/python/tvm/meta_schedule/database/memory_database.py b/python/tvm/meta_schedule/database/memory_database.py
index 6d10e4b5272a9..95d937cc77aad 100644
--- a/python/tvm/meta_schedule/database/memory_database.py
+++ b/python/tvm/meta_schedule/database/memory_database.py
@@ -56,6 +56,9 @@ def get_top_k(self, workload: Workload, top_k: int) -> List[TuningRecord]:
             )
         )[: int(top_k)]
 
+    def get_all_tuning_records(self) -> List[TuningRecord]:
+        return self.records
+
     def __len__(self) -> int:
         return len(self.records)
 
diff --git a/python/tvm/meta_schedule/runner/config.py b/python/tvm/meta_schedule/runner/config.py
index 585b88ed9939c..e3b5364dee22d 100644
--- a/python/tvm/meta_schedule/runner/config.py
+++ b/python/tvm/meta_schedule/runner/config.py
@@ -28,9 +28,14 @@ class EvaluatorConfig(NamedTuple):
     Parameters
     ----------
     number: int
-        The number of runs.
+        The number of times to run the evaluated function when taking a single measurement;
+        together, these runs form one `repeat` of the measurement.
     repeat: int
-        The number of times to repeat in each run.
+        The number of times to repeat the measurement.
+        In total, the function will be invoked (1 + number x repeat) times,
+        where the first invocation is a warm-up and is discarded.
+        The returned result contains `repeat` costs,
+        each of which is an average of `number` costs.
     min_repeat_ms: int
         Minimum repeat time in ms. if the execution latency is too short,
         increase the number of runs to the given time (in ms) to reduce the measurement error.
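A worked example of the clarified semantics: with the configuration below, the function is invoked 1 + number x repeat = 1 + 3 x 2 = 7 times; the first call warms up and is discarded, and the result carries repeat = 2 costs, each averaged over number = 3 runs.

```python
from tvm import meta_schedule as ms

config = ms.runner.EvaluatorConfig(
    number=3,          # runs averaged into one reported cost
    repeat=2,          # independent costs reported
    min_repeat_ms=100,
    enable_cpu_cache_flush=False,
)
```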
diff --git a/python/tvm/meta_schedule/testing/dataset_sample_candidates.py b/python/tvm/meta_schedule/testing/dataset_sample_candidates.py
index c80d78173e2e4..35b872e7351e8 100644
--- a/python/tvm/meta_schedule/testing/dataset_sample_candidates.py
+++ b/python/tvm/meta_schedule/testing/dataset_sample_candidates.py
@@ -103,6 +103,14 @@ def sample_candidates(task, task_name, model_name):
     -------
     None
     """
+    candidate_path = os.path.join(
+        args.candidate_cache_dir, model_name, task_name + "_candidates.json"
+    )
+    workload_path = os.path.join(args.candidate_cache_dir, model_name, task_name + "_workload.json")
+    database = ms.database.JSONDatabase(
+        path_workload=workload_path,
+        path_tuning_record=candidate_path,
+    )
     sample_init_population = tvm.get_global_func(
         "meta_schedule.SearchStrategyEvolutionarySearchSampleInitPopulation"
     )
@@ -128,7 +136,7 @@ def sample_candidates(task, task_name, model_name):
     context.initialize()
     context.pre_tuning(
         context.generate_design_space(),
-        database=ms.database.MemoryDatabase(),  # type: ignore
+        database=database,
        cost_model=ms.cost_model.RandomModel(),  # type: ignore
     )
 
@@ -148,16 +156,9 @@ def sample_candidates(task, task_name, model_name):
         all_states = all_states[: args.num_samples_per_task]
 
     workload = ms.database.Workload(context.mod)
-    file_path = os.path.join(args.candidate_cache_dir, model_name, task_name + ".json")
-    with open(file_path, "w", encoding="utf8") as file:
-        for i, state in enumerate(all_states):
-            tuning_record = ms.database.TuningRecord(state.trace, workload)
-            json_str = json.dumps(tuning_record.as_json())
-            assert "\n" not in json_str, "Failed to generate single line string."
-            if i == len(all_states) - 1:
-                file.write(json_str)
-            else:
-                file.write(json_str + "\n")
+    database.commit_workload(context.mod)
+    for state in all_states:
+        database.commit_tuning_record(ms.database.TuningRecord(state.trace, workload))
 
 
 args = _parse_args()  # pylint: disable=invalid-name
diff --git a/python/tvm/meta_schedule/testing/distributed_measure_candidates.py b/python/tvm/meta_schedule/testing/distributed_measure_candidates.py
new file mode 100644
index 0000000000000..8e646c4846724
--- /dev/null
+++ b/python/tvm/meta_schedule/testing/distributed_measure_candidates.py
@@ -0,0 +1,198 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=missing-docstring
+
+import argparse
+import glob
+import os
+
+from tqdm import tqdm  # type: ignore
+from tvm import meta_schedule as ms
+from tvm.target import Target
+
+
+def _parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--candidate_cache_dir", type=str, help="Please provide the full path to the candidates."
+    )
+    parser.add_argument(
+        "--result_cache_dir", type=str, help="Please provide the full path to the result database."
+    )
+    parser.add_argument(
+        "--target",
+        type=str,
+        default="nvidia/nvidia-v100",
+        help="Please specify the target hardware for tuning context.",
+    )
+    parser.add_argument(
+        "--rpc_host", type=str, help="Please provide the private IPv4 address for the tracker."
+    )
+    parser.add_argument(
+        "--rpc_port", type=int, default=4445, help="Please provide the port for the tracker."
+    )
+    parser.add_argument(
+        "--rpc_key",
+        type=str,
+        default="p3.2xlarge",
+        help="Please provide the key for the rpc servers.",
+    )
+    parser.add_argument(
+        "--builder_timeout_sec",
+        type=int,
+        default=10,
+        help="The time for the builder session to time out.",
+    )
+    parser.add_argument(
+        "--min_repeat_ms", type=int, default=100, help="The time for preheating the gpu."
+    )
+    parser.add_argument(
+        "--runner_timeout_sec",
+        type=int,
+        default=100,
+        help="The time for the runner session to time out.",
+    )
+    parser.add_argument(
+        "--cpu_flush", type=bool, default=False, help="Whether to enable cpu cache flush or not."
+    )
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=128,
+        help="The batch size of candidates sent to builder and runner each time.",
+    )
+    return parser.parse_args()
+
+
+# pylint: disable=too-many-locals
+def measure_candidates(database, builder, runner):
+    """Send the candidates to builder and runner for distributed measurement,
+    and save the results in a new json database.
+
+    Parameters
+    ----------
+    database : JSONDatabase
+        The database for candidates to be measured.
+    builder : Builder
+        The builder for building the candidates.
+    runner : Runner
+        The runner for measuring the candidates.
+
+    Returns
+    -------
+    None
+    """
+    candidates, runner_results, build_fail_indices, run_fail_indices = [], [], [], []
+    context = ms.TuneContext(target=Target(args.target))
+    tuning_records = database.get_all_tuning_records()
+    for record in tuning_records:
+        candidates.append(record.as_measure_candidate())
+    with ms.Profiler() as profiler:
+        for idx in range(0, len(candidates), args.batch_size):
+            batch_candidates = candidates[idx : idx + args.batch_size]
+            context._set_measure_candidates(batch_candidates)  # pylint: disable=protected-access
+            with ms.Profiler.timeit("build"):
+                context._send_to_builder(builder)  # pylint: disable=protected-access
+            with ms.Profiler.timeit("run"):
+                context._send_to_runner(runner)  # pylint: disable=protected-access
+                batch_runner_results = context._join()  # pylint: disable=protected-access
+            runner_results.extend(batch_runner_results)
+            for i, result in enumerate(context.builder_results):
+                if result.error_msg is None:
+                    ms.utils.remove_build_dir(result.artifact_path)
+                else:
+                    build_fail_indices.append(i + idx)
+            context._clear_measure_state()  # pylint: disable=protected-access
+
+    model_name, workload_name = database.path_workload.split("/")[-2:]
+    record_name = database.path_tuning_record.split("/")[-1]
+    new_database = ms.database.JSONDatabase(
+        path_workload=os.path.join(args.result_cache_dir, model_name, workload_name),
+        path_tuning_record=os.path.join(args.result_cache_dir, model_name, record_name),
+    )
+    workload = tuning_records[0].workload
+    new_database.commit_workload(workload.mod)
+    for i, (record, result) in enumerate(zip(tuning_records, runner_results)):
+        if result.error_msg is None:
+            new_database.commit_tuning_record(
+                ms.database.TuningRecord(
+                    trace=record.trace,
+                    workload=workload,
+                    run_secs=[v.value for v in result.run_secs],
+                    target=Target(args.target),
+                )
+            )
+        else:
+            run_fail_indices.append(i)
+    fail_indices_name = workload_name.replace("_workload.json", "_failed_indices.txt")
+    with open(
+        os.path.join(args.result_cache_dir, model_name, fail_indices_name), "w", encoding="utf8"
+    ) as file:
+        file.write(" ".join([str(n) for n in run_fail_indices]))
+    print(
+        f"Builder time: {profiler.get()['build']}, Runner time: {profiler.get()['run']}\n\
+            Failed number of builds: {len(build_fail_indices)},\
+            Failed number of runs: {len(run_fail_indices)}"
+    )
+
+
+args = _parse_args()  # pylint: disable=invalid-name
+
+
+def main():
+    builder = ms.builder.LocalBuilder(timeout_sec=args.builder_timeout_sec)
+    runner = ms.runner.RPCRunner(
+        rpc_config=ms.runner.RPCConfig(
+            tracker_host=args.rpc_host,
+            tracker_port=args.rpc_port,
+            tracker_key=args.rpc_key,
+            session_timeout_sec=args.runner_timeout_sec,
+        ),
+        evaluator_config=ms.runner.EvaluatorConfig(
+            number=3,
+            repeat=1,
+            min_repeat_ms=args.min_repeat_ms,
+            enable_cpu_cache_flush=args.cpu_flush,
+        ),
+        max_workers=os.cpu_count(),
+    )
+    if not os.path.isdir(args.candidate_cache_dir):
+        raise Exception("Please provide a correct candidate cache dir.")
+    try:
+        os.makedirs(args.result_cache_dir, exist_ok=True)
+    except OSError:
+        print(f"Directory {args.result_cache_dir} cannot be created successfully.")
+    model_dirs = glob.glob(os.path.join(args.candidate_cache_dir, "*"))
+    for model_dir in model_dirs:
+        model_name = model_dir.split("/")[-1]
+        os.makedirs(os.path.join(args.result_cache_dir, model_name), exist_ok=True)
+        all_tasks = glob.glob(os.path.join(model_dir, "*.json"))
+        workload_paths = []
+        for path in all_tasks:
+            if path.endswith("_workload.json"):
+                workload_paths.append(path)
+        for workload_path in tqdm(workload_paths):
+            candidate_path = workload_path.replace("_workload.json", "_candidates.json")
+            database = ms.database.JSONDatabase(
+                path_workload=workload_path,
+                path_tuning_record=candidate_path,
+            )
+            measure_candidates(database, builder, runner)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/tvm/meta_schedule/testing/tune_onnx.py b/python/tvm/meta_schedule/testing/tune_onnx.py
index 2409041c31760..88cb360c0171b 100644
--- a/python/tvm/meta_schedule/testing/tune_onnx.py
+++ b/python/tvm/meta_schedule/testing/tune_onnx.py
@@ -25,6 +25,7 @@
 from tvm import meta_schedule as ms
 from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc
 from tvm.relay.frontend import from_onnx
+from tvm.support import describe
 
 
 def _parse_args():
@@ -115,6 +116,7 @@ def _parse_args():
 
 
 def main():
+    describe()
     print(f"Workload: {ARGS.model_name}")
     onnx_model = onnx.load(ARGS.onnx_path)
     shape_dict = {}
diff --git a/python/tvm/meta_schedule/testing/tune_relay.py b/python/tvm/meta_schedule/testing/tune_relay.py
index ee1eec2aa0558..ce15c60c15e68 100644
--- a/python/tvm/meta_schedule/testing/tune_relay.py
+++ b/python/tvm/meta_schedule/testing/tune_relay.py
@@ -24,6 +24,7 @@
 from tvm import meta_schedule as ms
 from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc
 from tvm.meta_schedule.testing.relay_workload import get_network
+from tvm.support import describe
 
 
 def _parse_args():
@@ -113,6 +114,8 @@ def _parse_args():
 
 
 def main():
+    describe()
+    print(f"Workload: {ARGS.workload}")
     mod, params, (input_name, input_shape, input_dtype) = get_network(
         ARGS.workload,
         ARGS.input_shape,
@@ -120,7 +123,6 @@ def main():
     )
     input_info = {input_name: input_shape}
     input_data = {}
-    print(f"Workload: {ARGS.workload}")
     for input_name, input_shape in input_info.items():
         print(f"  input_name: {input_name}")
         print(f"  input_shape: {input_shape}")
diff --git a/python/tvm/meta_schedule/testing/tune_te.py b/python/tvm/meta_schedule/testing/tune_te.py
index 8caa023036fa7..8740d74424781 100644
--- a/python/tvm/meta_schedule/testing/tune_te.py
+++ b/python/tvm/meta_schedule/testing/tune_te.py
@@ -24,6 +24,7 @@
 from tvm import meta_schedule as ms
 from tvm import tir
 from tvm.meta_schedule.testing.te_workload import create_te_workload
+from tvm.support import describe
 
 
 def _parse_args():
@@ -102,6 +103,8 @@ def _parse_args():
 
 
 def main():
+    describe()
+    print(f"Workload: {ARGS.workload}")
     runner = ms.runner.RPCRunner(
         rpc_config=ARGS.rpc_config,
         evaluator_config=ms.runner.EvaluatorConfig(
diff --git a/python/tvm/meta_schedule/tune_context.py b/python/tvm/meta_schedule/tune_context.py
index b7975e7b2c4e9..30c726ded25bb 100644
--- a/python/tvm/meta_schedule/tune_context.py
+++ b/python/tvm/meta_schedule/tune_context.py
@@ -171,6 +171,50 @@ def __init__(
         )
         _ffi_api.TuneContextInitialize(self)  # type: ignore # pylint: disable=no-member
 
+    def _set_measure_candidates(self, candidates):
+        """Set candidates in a tuning context.
+
+        Parameters
+        ----------
+        candidates : List[MeasureCandidate]
+            A list of measure candidates for the tuning context.
+        """
+        _ffi_api.TuneContextSetMeasureCandidates(self, candidates)  # type: ignore # pylint: disable=no-member
+
+    def _send_to_builder(self, builder):
+        """Send candidates to builder.
+
+        Parameters
+        ----------
+        builder : Builder
+            The builder for building the candidates.
+ """ + _ffi_api.TuneContextSendToBuilder(self, builder) # type: ignore # pylint: disable=no-member + + def _send_to_runner(self, runner): + """Send candidates to runner. + + Parameters + ---------- + runner : Runner + The runner for running the candidates. + """ + _ffi_api.TuneContextSendToRunner(self, runner) # type: ignore # pylint: disable=no-member + + def _join(self): + """Join the runner processes. + + Returns + ------- + result : List[RunnerResult] + The runner results. + """ + return _ffi_api.TuneContextJoin(self) # type: ignore # pylint: disable=no-member + + def _clear_measure_state(self): + """Clear the measure states.""" + _ffi_api.TuneContextClearMeasureState(self) # type: ignore # pylint: disable=no-member + def generate_design_space(self) -> List[Schedule]: """Generate design spaces given a module. diff --git a/python/tvm/micro/contrib/stm32/emitter.py b/python/tvm/micro/contrib/stm32/emitter.py index aec5912871fd5..814f98f1b7882 100644 --- a/python/tvm/micro/contrib/stm32/emitter.py +++ b/python/tvm/micro/contrib/stm32/emitter.py @@ -482,8 +482,18 @@ def parse_library_format(self, model_library_format_path, quantization=None): with tarfile.TarFile(model_library_format_path) as f: f.extractall(extract_path) + with open(os.path.join(extract_path, "metadata.json")) as metadata_f: + metadata = json.load(metadata_f) + + all_module_names = [] + for name in metadata["modules"].keys(): + all_module_names.append(name) + assert len(metadata["modules"]) == 1, "Multiple modules is not supported." + # Extract informations from the Model Library Format - graph_file = os.path.join(extract_path, "executor-config", "graph", "graph.json") + graph_file = os.path.join( + extract_path, "executor-config", "graph", f"{all_module_names[0]}.graph" + ) with open(graph_file, "r") as f: # returns JSON object as a dictionary graph_dict = json.load(f) diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py index 1dd63b319dbd5..e220fa1ca5430 100644 --- a/python/tvm/micro/model_library_format.py +++ b/python/tvm/micro/model_library_format.py @@ -39,6 +39,7 @@ # This should be kept identical to runtime::symbol::tvm_module_main MAIN_FUNC_NAME_STR = "__tvm_main__" STANDALONE_CRT_URL = "./runtime" +METADATA_FILE = "metadata.json" class UnsupportedInModelLibraryFormatError(Exception): @@ -67,56 +68,78 @@ def generate_c_interface_header( EPHEMERAL_MODULE_TYPE_KEYS = ("metadata_module",) -def _populate_codegen_dir(mod, codegen_dir: str, module_name: str = None): +def _populate_codegen_dir( + mods: typing.Union[ + typing.List[executor_factory.ExecutorFactoryModule], + typing.List[tvm.runtime.Module], + ], + codegen_dir: str, +): """Populate the codegen sub-directory as part of a Model Library Format export. Parameters ---------- - mod : tvm.runtime.Module - Module which should be written to codegen_dir. + mods : List[tvm.relay.backend.executor_factory.ExecutorFactoryModule], List[tvm.runtime.Module] + A list of the return value of tvm.relay.build, which + will be exported into Model Library Format. codegen_dir : str Path to the codegen directory on disk. 
diff --git a/python/tvm/micro/contrib/stm32/emitter.py b/python/tvm/micro/contrib/stm32/emitter.py
index aec5912871fd5..814f98f1b7882 100644
--- a/python/tvm/micro/contrib/stm32/emitter.py
+++ b/python/tvm/micro/contrib/stm32/emitter.py
@@ -482,8 +482,18 @@ def parse_library_format(self, model_library_format_path, quantization=None):
         with tarfile.TarFile(model_library_format_path) as f:
             f.extractall(extract_path)
 
+        with open(os.path.join(extract_path, "metadata.json")) as metadata_f:
+            metadata = json.load(metadata_f)
+
+        all_module_names = []
+        for name in metadata["modules"].keys():
+            all_module_names.append(name)
+        assert len(metadata["modules"]) == 1, "Multiple modules are not supported."
+
         # Extract information from the Model Library Format
-        graph_file = os.path.join(extract_path, "executor-config", "graph", "graph.json")
+        graph_file = os.path.join(
+            extract_path, "executor-config", "graph", f"{all_module_names[0]}.graph"
+        )
         with open(graph_file, "r") as f:
             # returns JSON object as a dictionary
             graph_dict = json.load(f)
diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py
index 1dd63b319dbd5..e220fa1ca5430 100644
--- a/python/tvm/micro/model_library_format.py
+++ b/python/tvm/micro/model_library_format.py
@@ -39,6 +39,7 @@
 # This should be kept identical to runtime::symbol::tvm_module_main
 MAIN_FUNC_NAME_STR = "__tvm_main__"
 STANDALONE_CRT_URL = "./runtime"
+METADATA_FILE = "metadata.json"
 
 
 class UnsupportedInModelLibraryFormatError(Exception):
@@ -67,56 +68,78 @@ def generate_c_interface_header(
 EPHEMERAL_MODULE_TYPE_KEYS = ("metadata_module",)
 
 
-def _populate_codegen_dir(mod, codegen_dir: str, module_name: str = None):
+def _populate_codegen_dir(
+    mods: typing.Union[
+        typing.List[executor_factory.ExecutorFactoryModule],
+        typing.List[tvm.runtime.Module],
+    ],
+    codegen_dir: str,
+):
     """Populate the codegen sub-directory as part of a Model Library Format export.
 
     Parameters
     ----------
-    mod : tvm.runtime.Module
-        Module which should be written to codegen_dir.
+    mods : List[tvm.relay.backend.executor_factory.ExecutorFactoryModule], List[tvm.runtime.Module]
+        A list of the return values of tvm.relay.build, which
+        will be exported into Model Library Format.
     codegen_dir : str
         Path to the codegen directory on disk.
-    module_name: Optional[str]
-        Name used to prefix the generated source files
     """
-    dso_modules = mod._collect_dso_modules()
-    non_dso_modules = mod._collect_from_import_tree(lambda m: m not in dso_modules)
+    dso_modules = []
+    for mod in mods:
+        if isinstance(mod, executor_factory.ExecutorFactoryModule):
+            lib = mod.lib
+        elif isinstance(mod, tvm.runtime.Module):
+            lib = mod
+        else:
+            raise RuntimeError(f"Unsupported module type: {type(mod)}")
 
-    # Filter ephemeral modules which cannot be exported.
-    dso_modules = [m for m in dso_modules if m.type_key not in EPHEMERAL_MODULE_TYPE_KEYS]
-    non_dso_modules = [m for m in non_dso_modules if m.type_key not in EPHEMERAL_MODULE_TYPE_KEYS]
+        dso_modules = lib._collect_dso_modules()
+        non_dso_modules = lib._collect_from_import_tree(lambda m: m not in dso_modules)
 
-    if non_dso_modules:
-        raise UnsupportedInModelLibraryFormatError(
-            f"Don't know how to export non-c or non-llvm modules; found: {non_dso_modules!r}"
-        )
+        # Filter ephemeral modules which cannot be exported.
+        dso_modules = [m for m in dso_modules if m.type_key not in EPHEMERAL_MODULE_TYPE_KEYS]
+        non_dso_modules = [
+            m for m in non_dso_modules if m.type_key not in EPHEMERAL_MODULE_TYPE_KEYS
+        ]
 
-    mod_indices = {"lib": 0, "src": 0}
-    host_codegen_dir = os.path.join(codegen_dir, "host")
-    lib_name = f"{module_name}_lib" if module_name else "lib"
-
-    for dso_mod in dso_modules:
-        if dso_mod.type_key == "c":
-            assert dso_mod.format in ["c", "cc", "cpp"]
-            ext = dso_mod.format
-            index = mod_indices["src"]
-            mod_indices["src"] += 1
-            parent_dir = os.path.join(host_codegen_dir, "src")
-            file_name = os.path.join(parent_dir, f"{lib_name}{index}.{ext}")
-        elif dso_mod.type_key == "llvm":
-            index = mod_indices["lib"]
-            mod_indices["lib"] += 1
-            parent_dir = os.path.join(host_codegen_dir, "lib")
-            file_name = os.path.join(parent_dir, f"{lib_name}{index}.o")
-        else:
-            assert (
-                False
-            ), f"do not expect module with type_key={mod.type_key} from _collect_dso_modules"
+        if non_dso_modules:
+            raise UnsupportedInModelLibraryFormatError(
+                f"Don't know how to export non-c or non-llvm modules; found: {non_dso_modules!r}"
+            )
+
+        mod_indices = {"lib": 0, "src": 0}
+        host_codegen_dir = os.path.join(codegen_dir, "host")
+        lib_name = (
+            f"{mod.libmod_name}_lib"
+            if isinstance(mod, executor_factory.ExecutorFactoryModule)
+            else "lib"
+        )
 
-        if not os.path.exists(parent_dir):
-            os.makedirs(parent_dir)
-        dso_mod.save(file_name)
+        for dso_mod in dso_modules:
+            if dso_mod.type_key == "c":
+                assert dso_mod.format in ["c", "cc", "cpp"]
+                ext = dso_mod.format
+                index = mod_indices["src"]
+                mod_indices["src"] += 1
+                parent_dir = os.path.join(host_codegen_dir, "src")
+                file_name = os.path.join(parent_dir, f"{lib_name}{index}.{ext}")
+            elif dso_mod.type_key == "llvm":
+                index = mod_indices["lib"]
+                mod_indices["lib"] += 1
+                parent_dir = os.path.join(host_codegen_dir, "lib")
+                file_name = os.path.join(parent_dir, f"{lib_name}{index}.o")
+            else:
+                assert (
+                    False
+                ), f"do not expect module with type_key={lib.type_key} from _collect_dso_modules"
+
+            if not os.path.exists(parent_dir):
+                os.makedirs(parent_dir)
+            dso_mod.save(file_name)
 
 
 def _build_memory_map(mod):
@@ -297,7 +320,7 @@ def _should_generate_interface_header(mod):
     return "interface-api" in mod.executor and mod.executor["interface-api"] == "c"
 
 
-def _make_tar(source_dir, tar_file_path, mod):
+def _make_tar(source_dir, tar_file_path, modules):
     """Build a tar file from source_dir."""
     with tarfile.open(tar_file_path, "w") as tar_f:
 
@@ -307,91 +330,127 @@ def reset(tarinfo):
             return tarinfo
 
         tar_f.add(str(source_dir), arcname=".", filter=reset)
 
-    is_aot = isinstance(mod, executor_factory.AOTExecutorFactoryModule)
-    if is_aot and str(mod.runtime) == "crt":
-        tar_f.add(get_standalone_crt_dir(), arcname=STANDALONE_CRT_URL)
+
+    for mod in modules:
+        is_aot = isinstance(mod, executor_factory.AOTExecutorFactoryModule)
+        if is_aot and str(mod.runtime) == "crt":
+            tar_f.add(get_standalone_crt_dir(), arcname=STANDALONE_CRT_URL)
+            break
 
 
-_GENERATED_VERSION = 6
+_GENERATED_VERSION = 7
+
+
+def _is_module_names_unique(mods: typing.List[executor_factory.ExecutorFactoryModule]):
+    """Check if built modules have unique names.
+
+    Parameters
+    ----------
+    mods : List[tvm.relay.backend.executor_factory.ExecutorFactoryModule]
+        A list of the return values of tvm.relay.build,
+        which will be exported into Model Library Format.
+    """
+    all_names = []
+    for mod in mods:
+        all_names.append(mod.libmod_name)
+
+    return len(set(all_names)) == len(all_names)
 
 
 def _export_graph_model_library_format(
-    mod: executor_factory.ExecutorFactoryModule, tempdir: pathlib.Path
+    mods: typing.List[executor_factory.ExecutorFactoryModule], tempdir: pathlib.Path
 ):
     """Export a tvm.relay.build artifact in Model Library Format.
 
     Parameters
     ----------
-    mod : tvm.relay.backend.executor_factory.ExecutorFactoryModule
-        The return value of tvm.relay.build, which will be exported into Model Library Format.
+    mods : List[tvm.relay.backend.executor_factory.ExecutorFactoryModule]
+        A list of the return values of tvm.relay.build,
+        which will be exported into Model Library Format.
     tempdir : pathlib.Path
         Temporary directory to populate with Model Library Format contents.
     """
-    is_aot = isinstance(mod, executor_factory.AOTExecutorFactoryModule)
-    executor = ["aot"] if is_aot else ["graph"]
+
+    assert _is_module_names_unique(mods), "Multiple modules should have unique names."
 
     metadata = {
         "version": _GENERATED_VERSION,
-        "model_name": mod.libmod_name,
-        "export_datetime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%SZ"),
-        "memory": _build_memory_map(mod),
-        "target": [str(t) for t in mod.target],
-        "executors": executor,
-        "style": "full-model",
     }
-
-    if is_aot and (str(mod.runtime) == "crt"):
-        standalone_crt = {
-            "short_name": "tvm_standalone_crt",
-            "url": f"{STANDALONE_CRT_URL}",
-            "url_type": "mlf_path",
-            "version_spec": f"{tvm.__version__}",
+    metadata["modules"] = {}
+    for mod in mods:
+        is_aot = isinstance(mod, executor_factory.AOTExecutorFactoryModule)
+        executor = ["aot"] if is_aot else ["graph"]
+        module_name = mod.libmod_name
+        metadata["modules"][module_name] = {
+            "model_name": module_name,
+            "export_datetime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%SZ"),
+            "memory": _build_memory_map(mod),
+            "target": [str(t) for t in mod.target],
+            "executors": executor,
+            "style": "full-model",
         }
-        external_dependencies = [standalone_crt]
-        metadata["external_dependencies"] = external_dependencies
 
-    with open(tempdir / "metadata.json", "w") as json_f:
+        if is_aot and (str(mod.runtime) == "crt"):
+            standalone_crt = {
+                "short_name": "tvm_standalone_crt",
+                "url": f"{STANDALONE_CRT_URL}",
+                "url_type": "mlf_path",
+                "version_spec": f"{tvm.__version__}",
+            }
+            external_dependencies = [standalone_crt]
+            metadata["modules"][module_name]["external_dependencies"] = external_dependencies
+
+    with open(tempdir / METADATA_FILE, "w") as json_f:
         json.dump(metadata, json_f, indent=2, sort_keys=True)
 
     codegen_dir = tempdir / "codegen"
     codegen_dir.mkdir()
-    _populate_codegen_dir(mod.lib, codegen_dir, mod.libmod_name)
-
-    if _should_generate_interface_header(mod):
-        include_path = codegen_dir / "host" / "include"
-        include_path.mkdir()
-        inputs, outputs = _get_inputs_and_outputs_from_module(mod)
-        devices = mod.get_devices()
-        pools = _get_pools_from_module(mod)
-        io_pool_allocations = _get_io_pool_allocation_from_module(mod)
-        workspace_size = int(metadata["memory"]["functions"]["main"][0]["workspace_size_bytes"])
-        generate_c_interface_header(
-            mod.libmod_name,
-            inputs,
-            outputs,
-            pools,
-            io_pool_allocations,
-            devices,
-            workspace_size,
-            include_path,
-        )
+    _populate_codegen_dir(mods, codegen_dir)
 
     parameters_dir = tempdir / "parameters"
     parameters_dir.mkdir()
-    param_filename = parameters_dir / f"{mod.libmod_name}.params"
-    with open(param_filename, "wb") as f:
-        f.write(param_dict.save_param_dict(mod.params))
-
     src_dir = tempdir / "src"
     src_dir.mkdir()
-    with open(src_dir / "relay.txt", "w") as f:
-        f.write(str(mod.ir_mod))
+    graph_config_dir = tempdir / "executor-config" / "graph"
+    for mod in mods:
+        if _should_generate_interface_header(mod):
+            include_path = codegen_dir / "host" / "include"
+            if not include_path.exists():
+                include_path.mkdir()
+
+            inputs, outputs = _get_inputs_and_outputs_from_module(mod)
+            devices = mod.get_devices()
+            pools = _get_pools_from_module(mod)
+            io_pool_allocations = _get_io_pool_allocation_from_module(mod)
+            workspace_size = int(
+                metadata["modules"][mod.libmod_name]["memory"]["functions"]["main"][0][
+                    "workspace_size_bytes"
+                ]
+            )
+            generate_c_interface_header(
+                mod.libmod_name,
+                inputs,
+                outputs,
+                pools,
+                io_pool_allocations,
+                devices,
+                workspace_size,
+                include_path,
+            )
+
+        is_aot = isinstance(mod, executor_factory.AOTExecutorFactoryModule)
+        param_filename = parameters_dir / f"{mod.libmod_name}.params"
+        with open(param_filename, "wb") as f:
+            f.write(param_dict.save_param_dict(mod.params))
 
-    if not is_aot:
-        graph_config_dir = tempdir / "executor-config" / "graph"
-        graph_config_dir.mkdir(parents=True)
-        with open(graph_config_dir / "graph.json", "w") as f:
-            f.write(mod.get_executor_config())
+        with open(src_dir / f"{mod.libmod_name}.relay", "w") as f:
+            f.write(str(mod.ir_mod))
+
+        if not is_aot:
+            if not graph_config_dir.exists():
+                graph_config_dir.mkdir(parents=True)
+            with open(graph_config_dir / f"{mod.libmod_name}.graph", "w") as f:
+                f.write(mod.get_executor_config())
 
 
 class NonStaticShapeError(Exception):
@@ -451,14 +510,11 @@ def _eval_shape(param_name, buffer_shape):
 
 def _export_operator_model_library_format(mod: build_module.OperatorModule, tempdir):
     """Export the result of tvm.build() in Model Library Format.
 
     Parameters
     ----------
     mod : runtime.Module
         The Module returned from tvm.build().
-    args : list of Buffer or Tensor or Var, optional
-        The args supplied to tvm.build().
-    file_name : str
-        Path to the .tar archive to generate.
+    tempdir : str
+        Temporary directory to populate with Model Library Format contents.
     """
     targets = []
@@ -484,12 +540,12 @@ def _export_operator_model_library_format(mod: build_module.OperatorModule, tempdir):
         "executors": [],
         "style": "operator",
     }
-    with open(tempdir / "metadata.json", "w") as metadata_f:
+    with open(tempdir / METADATA_FILE, "w") as metadata_f:
         json.dump(metadata, metadata_f)
 
     codegen_dir = tempdir / "codegen"
     codegen_dir.mkdir()
-    _populate_codegen_dir(mod, codegen_dir)
+    _populate_codegen_dir(list([mod]), codegen_dir)
 
 
 ExportableModule = typing.Union[
@@ -499,7 +555,10 @@ def _export_operator_model_library_format(mod: build_module.OperatorModule, tempdir):
 ]
 
 
-def export_model_library_format(mod: ExportableModule, file_name: typing.Union[str, pathlib.Path]):
+def export_model_library_format(
+    mods: typing.Union[ExportableModule, typing.List[ExportableModule]],
+    file_name: typing.Union[str, pathlib.Path],
+):
     """Export the build artifact in Model Library Format.
 
     This function creates a .tar archive containing the build artifacts in a standardized
@@ -508,7 +567,7 @@ def export_model_library_format(mods, file_name):
 
     Parameters
     ----------
-    mod : ExportableModule
+    mods : ExportableModule or List[ExportableModule]
         The return value of tvm.build or tvm.relay.build.
     file_name : str
         Path to the .tar archive to generate.
@@ -518,20 +577,36 @@ def export_model_library_format(mods, file_name):
     file_name : str
         The path to the generated .tar archive.
""" - file_name = pathlib.Path(file_name) + modules = mods + if not isinstance(mods, list): + modules = list([mods]) + + operator_module_type = all(isinstance(mod, build_module.OperatorModule) for mod in modules) + graph_module_type = all( + isinstance( + mod, + ( + executor_factory.AOTExecutorFactoryModule, + executor_factory.GraphExecutorFactoryModule, + ), + ) + for mod in modules + ) + file_name = pathlib.Path(file_name) tempdir = utils.tempdir() - if isinstance(mod, build_module.OperatorModule): - _export_operator_model_library_format(mod, tempdir.path) - elif isinstance( - mod, - (executor_factory.AOTExecutorFactoryModule, executor_factory.GraphExecutorFactoryModule), - ): - _export_graph_model_library_format(mod, tempdir.path) + if operator_module_type: + if len(modules) != 1: + raise RuntimeError("Multiple operator is not supported.") + _export_operator_model_library_format(modules[0], tempdir.path) + elif graph_module_type: + _export_graph_model_library_format(modules, tempdir.path) else: - raise NotImplementedError(f"Don't know how to export module of type {mod.__class__!r}") + raise NotImplementedError( + f"Don't know how to export module of type {modules[0].__class__!r}" + ) - _make_tar(tempdir.path, file_name, mod) + _make_tar(tempdir.path, file_name, modules) return file_name diff --git a/python/tvm/micro/testing/utils.py b/python/tvm/micro/testing/utils.py index 81e29a92a86a1..a48c8dc3230fb 100644 --- a/python/tvm/micro/testing/utils.py +++ b/python/tvm/micro/testing/utils.py @@ -24,6 +24,8 @@ import time from typing import Union +import tvm +from tvm import relay from tvm.micro.project_api.server import IoTimeoutError # Timeout in seconds for AOT transport. @@ -77,9 +79,36 @@ def _read_line(transport, timeout_sec: int) -> str: def mlf_extract_workspace_size_bytes(mlf_tar_path: Union[pathlib.Path, str]) -> int: """Extract an MLF archive file and read workspace size from metadata file.""" + workspace_size = 0 with tarfile.open(mlf_tar_path, "r:*") as tar_file: tar_members = [ti.name for ti in tar_file.getmembers()] assert "./metadata.json" in tar_members with tar_file.extractfile("./metadata.json") as f: metadata = json.load(f) - return metadata["memory"]["functions"]["main"][0]["workspace_size_bytes"] + for mod_name in metadata["modules"].keys(): + workspace_size += metadata["modules"][mod_name]["memory"]["functions"]["main"][0][ + "workspace_size_bytes" + ] + return workspace_size + + +def get_conv2d_relay_module(): + """Generate a conv2d Relay module for testing.""" + data_shape = (1, 3, 64, 64) + weight_shape = (8, 3, 5, 5) + data = relay.var("data", relay.TensorType(data_shape, "int8")) + weight = relay.var("weight", relay.TensorType(weight_shape, "int8")) + y = relay.nn.conv2d( + data, + weight, + padding=(2, 2), + channels=8, + kernel_size=(5, 5), + data_layout="NCHW", + kernel_layout="OIHW", + out_dtype="int32", + ) + f = relay.Function([data, weight], y) + mod = tvm.IRModule.from_expr(f) + mod = relay.transform.InferType()(mod) + return mod diff --git a/python/tvm/relay/op/contrib/cmsisnn.py b/python/tvm/relay/op/contrib/cmsisnn.py index 09831929e5277..8d714b7269d9c 100644 --- a/python/tvm/relay/op/contrib/cmsisnn.py +++ b/python/tvm/relay/op/contrib/cmsisnn.py @@ -223,7 +223,6 @@ def qnn_max_pool2d_pattern(): def check_qnn_max_pool2d(pattern): """Check if max pool2d is supported by CMSIS-NN.""" output = pattern - input_op = None if str(pattern.op.name) == "clip": pooling = pattern.args[0] diff --git a/python/tvm/relay/op/contrib/dnnl.py 
diff --git a/python/tvm/relay/op/contrib/cmsisnn.py b/python/tvm/relay/op/contrib/cmsisnn.py
index 09831929e5277..8d714b7269d9c 100644
--- a/python/tvm/relay/op/contrib/cmsisnn.py
+++ b/python/tvm/relay/op/contrib/cmsisnn.py
@@ -223,7 +223,6 @@ def qnn_max_pool2d_pattern():
     def check_qnn_max_pool2d(pattern):
         """Check if max pool2d is supported by CMSIS-NN."""
         output = pattern
-        input_op = None
 
         if str(pattern.op.name) == "clip":
             pooling = pattern.args[0]
diff --git a/python/tvm/relay/op/contrib/dnnl.py b/python/tvm/relay/op/contrib/dnnl.py
index 6581f10a2f568..c251b66bfbc77 100644
--- a/python/tvm/relay/op/contrib/dnnl.py
+++ b/python/tvm/relay/op/contrib/dnnl.py
@@ -39,13 +39,14 @@
 from tvm.relay import transform
 from tvm.relay.expr import GlobalVar
 from tvm.relay.expr_functor import ExprMutator, ExprVisitor
+from tvm.relay.expr import const
 from tvm.relay.analysis import analysis as _analysis
 from tvm.relay import expr as _expr
 
 from ... import _ffi_api
-from ...dataflow_pattern import wildcard, is_op, is_expr, rewrite, DFPatternCallback
+from ...dataflow_pattern import wildcard, is_op, is_constant, is_expr, rewrite, DFPatternCallback
 from .register import register_pattern_table
 
@@ -56,8 +57,8 @@ def _register_external_op_helper(op_name, supported=True):
     """The helper function to indicate that a given operator can be supported
     by DNNL.
 
-    Paramters
-    ---------
+    Parameters
+    ----------
     op_name : Str
         The name of operator that will be registered.
 
@@ -69,6 +70,10 @@ def _register_external_op_helper(op_name, supported=True):
 
     @tvm.ir.register_op_attr(op_name, "target.dnnl")
     def _func_wrapper(expr):
+        args = expr.args
+        if any([x.checked_type.dtype == "int64" for x in args]):
+            logger.info("DNNL does not support int64.")
+            return False
         return supported
 
     return _func_wrapper
@@ -90,6 +95,7 @@ def _func_wrapper(expr):
 _register_external_op_helper("exp")
 _register_external_op_helper("log")
 _register_external_op_helper("sqrt")
+_register_external_op_helper("round")
 _register_external_op_helper("nn.relu")
 _register_external_op_helper("nn.leaky_relu")
 _register_external_op_helper("tanh")
@@ -199,6 +205,70 @@ def make_dnnl_pattern(op_name, with_bias, with_eltwise):
     return dnnl_pattern
 
 
+def make_qnn_conv2d_pattern():
+    """Make qnn.conv2d based pattern supported by DNNL
+
+    Returns
+    -------
+    pattern : Tuple(pattern_name, CallPattern)
+        Created pattern name, along with its CallPattern.
+    """
+    data = wildcard()
+    weight = is_constant()
+    bias = is_constant()
+    o_scl = is_constant()
+    dst_zp = is_constant()
+    act_scl = is_constant()
+    sum_scl = is_constant()
+    sum_src = wildcard()
+
+    zero_zp = is_expr(const(0, dtype="int32"))
+
+    pat = is_op("qnn.conv2d")(data, weight, zero_zp, zero_zp, is_constant(), is_constant())
+    pat = is_op("cast")(pat)
+    pat = is_op("add")(pat, bias) | pat  # optional bias
+    pat = is_op("multiply")(pat, o_scl)
+    pat = is_op("clip")(pat)  # TBD, not only clip
+    pat = is_op("multiply")(pat, act_scl) | pat  # optional multiply. Ex: act_scl == 1
+    pat = is_op("add")(pat, sum_scl * is_op("cast")(sum_src)) | pat  # optional sum
+    pat = is_op("add")(pat, dst_zp) | pat  # optional dst_zp, can be dst_zp == 0
+    pat = is_op("cast")(pat)
+
+    return "dnnl.qnn.conv2d", pat
+
+
+def make_qnn_dense_pattern():
+    """Make qnn.dense based pattern supported by DNNL
+
+    Returns
+    -------
+    pattern : Tuple(pattern_name, CallPattern)
+        Created pattern name, along with its CallPattern.
+    """
+    data = wildcard()
+    weight = is_constant()
+    bias = is_constant()
+    o_scl = is_constant()
+    dst_zp = is_constant()
+    act_scl = is_constant()
+    sum_scl = is_constant()
+    sum_src = wildcard()
+
+    zero_zp = is_expr(const(0, dtype="int32"))
+
+    pat = is_op("qnn.dense")(data, weight, zero_zp, zero_zp, is_constant(), is_constant())
+    pat = is_op("cast")(pat)
+    pat = is_op("add")(pat, bias) | pat  # optional bias
+    pat = is_op("multiply")(pat, o_scl)
+    pat = is_op("clip")(pat)  # TBD, not only clip
+    pat = is_op("multiply")(pat, act_scl) | pat  # optional multiply. Ex: act_scl == 1
+    pat = is_op("add")(pat, sum_scl * is_op("cast")(sum_src)) | pat  # optional sum
+    pat = is_op("add")(pat, dst_zp) | pat  # optional dst_zp, can be dst_zp == 0
+    pat = is_op("cast")(pat)
+
+    return "dnnl.qnn.dense", pat
+
+
 @register_pattern_table("dnnl")
 def pattern_table():
     """Create dnnl patterns.
@@ -208,8 +278,11 @@ def pattern_table():
     dnnl_patterns : List[dnnl_pattern]
         Created patterns.
     """
+    dnnl_patterns = list()
+    dnnl_patterns.append(make_qnn_conv2d_pattern())
+    dnnl_patterns.append(make_qnn_dense_pattern())
+
     elt_list = ["nn.relu", "tanh", "sigmoid", "gelu", None]
-    dnnl_patterns = []
     for with_bias in [True, False]:
         for elt in elt_list:
             if not with_bias and not elt:
@@ -707,3 +780,201 @@ def rewrite_dense_bias_gelu_reshape_last(mod):
         [DenseReshapeBiasGeluRewrite(), DenseReshapeBiasGeluRewrite(has_gelu=False)], mod["main"]
     )
     return mod
+
+
+class LegalizeQnnOpForDnnl(DFPatternCallback):
+    """Legalize QNN based patterns to match DNNL
+
+    original pattern:
+      OP = qnn.dense | qnn.conv2d
+      %1 = OP(SRC, WGH) - OP(src_zp, WGH)                                // qnn.conv2d
+      %2 = %1 + orig_bias                                                // bias
+      %2 = (%1 - rq_in_zp) * rq_in_scl / rq_out_scl + rq_out_zp          // qnn.requantize
+      %3 = act(%2)                                                       // activation == clip
+      %4 = ((%3 - sum_lh_zp) * sum_lh_scl + (SRC2 - sum_rh_zp) * sum_rh_scl)  // qnn.add
+           / sum_out_scl + sum_out_zp
+
+    transform to DNNL compatible:
+      %1 = OP(SRC, WGH)
+      %2 = cast(%1, dtype="float")
+      %2 = (%1 + bias) * o_scl
+      %3 = act(%2) * act_scl
+      %4 = %3 + SRC2 * sum_scl
+      %5 = %4 + dst_zp
+      %6 = cast(%5, dtype="float")
+
+    where:
+      o_scl = rq_in_scl / rq_out_scl
+      act_scl = sum_lhs_scl / sum_out_scl
+      sum_scl = sum_rhs_scl / sum_out_scl
+      bias = orig_bias - OP(src_zp, WGH) - rq_in_zp + rq_out_zp * rq_out_scl / rq_in_scl
+      dst_zp = sum_out_zp - sum_lhs_zp * sum_lhs_scl / sum_out_scl -
+               sum_rhs_zp * sum_rhs_scl / sum_out_scl
+    """
+
+    def __init__(self):
+        super(LegalizeQnnOpForDnnl, self).__init__()
+        self.src = wildcard()
+        self.wgh = wildcard()
+        self.bias = wildcard()
+        self.sum_src = wildcard()
+
+        self.src_scl = is_constant()
+        self.src_zp = is_constant()
+        self.wgh_scl = is_constant()
+        self.wgh_zp = is_expr(const(0))
+
+        self.rq_in_scl = is_constant()
+        self.rq_in_zp = is_constant()
+        self.rq_out_scl = is_constant()
+        self.rq_out_zp = is_constant()
+
+        self.sum_lhs_scl = is_constant()
+        self.sum_lhs_zp = is_constant()
+        self.sum_rhs_scl = is_constant()
+        self.sum_rhs_zp = is_constant()
+        self.sum_out_scl = is_constant()
+        self.sum_out_zp = is_constant()
+
+        self.root = (is_op("qnn.conv2d") | is_op("qnn.dense"))(
+            self.src, self.wgh, self.src_zp, self.wgh_zp, self.src_scl, self.wgh_scl
+        )
+        pat = is_op("add")(self.root, self.bias) | self.root  # optional bias
+        pat = is_op("qnn.requantize")(
+            pat, self.rq_in_scl, self.rq_in_zp, self.rq_out_scl, self.rq_out_zp
+        )
+        pat = is_op("clip")(pat)
+        cast = is_op("cast")(pat)
+        pat = is_op("qnn.add")(
+            cast,
+            self.sum_src,
+            self.sum_lhs_scl,
+            self.sum_lhs_zp,
+            self.sum_rhs_scl,
+            self.sum_rhs_zp,
+            self.sum_out_scl,
+            self.sum_out_zp,
+        )
+        pat = is_op("clip")(pat)
+        self.pattern = pat | cast
+
+    def callback(self, pre, post, node_map):
+        root = node_map[self.root][0]
+        src = node_map[self.src][0]
+        wgh = node_map[self.wgh][0]
+        bias = node_map.get(self.bias, default=[relay.const(0, dtype="int32")])[0]
+        src_zp = node_map[self.src_zp][0]
+        rq_in_scl = node_map[self.rq_in_scl][0]
+        rq_in_zp = node_map[self.rq_in_zp][0]
+        rq_out_scl = node_map[self.rq_out_scl][0]
+        rq_out_zp = node_map[self.rq_out_zp][0]
+
+        final_dtype =
 node_map[self.pattern][0].checked_type.dtype
+
+        if root.op == relay.op.get("qnn.conv2d"):
+            dst_layout = root.attrs.out_layout
+            dst_layout = root.attrs.data_layout if dst_layout == "" else dst_layout
+            wgh_layout = root.attrs.kernel_layout
+        else:
+            # qnn.dense has no layout attributes. Assume that it is plain
+            dst_layout = "NC"
+            wgh_layout = "OI"
+
+        # TODO(@apeskov): dst_layout may be blocked
+        bias_rank = len(dst_layout) - dst_layout.index("C")
+
+        sum_src = node_map[self.sum_src][0] if self.sum_src in node_map else None
+        # Default values if qnn.sum is not present
+        sum_lhs_scl = node_map[self.sum_lhs_scl][0] if sum_src else relay.const(1, dtype="float32")
+        sum_lhs_zp = node_map[self.sum_lhs_zp][0] if sum_src else relay.const(0, dtype="int32")
+        sum_rhs_scl = node_map[self.sum_rhs_scl][0] if sum_src else relay.const(0, dtype="float32")
+        sum_rhs_zp = node_map[self.sum_rhs_zp][0] if sum_src else relay.const(0, dtype="int32")
+        sum_out_scl = node_map[self.sum_out_scl][0] if sum_src else relay.const(1, dtype="float32")
+        sum_out_zp = node_map[self.sum_out_zp][0] if sum_src else relay.const(0, dtype="int32")
+
+        def cast_fp(op):
+            return relay.op.cast(op, dtype="float32")
+
+        # recalculate some factors
+        o_scl = rq_in_scl / rq_out_scl
+        act_scl = sum_lhs_scl / sum_out_scl
+        sum_scl = sum_rhs_scl / sum_out_scl
+        dst_zp = (
+            cast_fp(sum_out_zp)
+            - cast_fp(sum_lhs_zp) * sum_lhs_scl / sum_out_scl
+            - cast_fp(sum_rhs_zp) * sum_rhs_scl / sum_out_scl
+        )
+        bias = self.squeeze_bias(bias, dst_layout)
+        bias = (
+            cast_fp(bias)
+            - cast_fp(self.fake_op(src_zp, wgh, wgh_layout))
+            - cast_fp(rq_in_zp)
+            + cast_fp(rq_out_zp) * rq_out_scl / rq_in_scl
+        )
+        bias = self.broadcast_to_rank(bias, bias_rank)
+
+        zero_zp = relay.const(0, dtype="int32")
+        one_scl = relay.const(1.0, dtype="float32")
+
+        # construct new graph with proper post op ordering
+        gr = tvm.relay.Call(
+            root.op,
+            [src, wgh, zero_zp, zero_zp, one_scl, one_scl],
+            root.attrs,
+            root.type_args,
+            root.span,
+        )
+        gr = relay.op.cast(gr, dtype="float32")
+        gr = gr + bias
+        gr = gr * o_scl
+        gr = relay.op.clip(gr, 0, 255) * act_scl
+        gr = gr + sum_scl * cast_fp(sum_src) if sum_src else gr
+        gr = gr + dst_zp
+        gr = relay.op.cast(gr, dtype=final_dtype)
+        return gr
+
+    @staticmethod
+    def fake_op(zp, wgh, layout):
+        """Fake operator implementation for zp broadcast input"""
+        # Conv: reduce kernel {OC, IC, KH, KW} -> {OC} in case of group that is still correct
+        # Dense: reduce kernel {OC, IC} -> {OC}
+        wgh_int = relay.op.cast(wgh, dtype="int32")
+        reduced_kernel = relay.op.sum(
+            wgh_int, axis=[layout.index("O")], keepdims=False, exclude=True
+        )
+        return zp * reduced_kernel
+
+    @staticmethod
+    def squeeze_bias(bias, layout):
+        shape = transform.InferTypeLocal(bias).concrete_shape
+        c_position = layout.index("C") - len(layout) + len(shape)
+        squeeze_idxs = [i for i in range(len(shape)) if i != c_position]
+        return relay.op.squeeze(bias, squeeze_idxs)
+
+    @staticmethod
+    def broadcast_to_rank(op, rank):
+        """Scalar or 1D tensor are supported"""
+        shape = transform.InferTypeLocal(op).concrete_shape
+        if len(shape) == 0:
+            return op
+        if len(shape) == 1:
+            return relay.op.expand_dims(op, 1, rank - 1)
+        raise ValueError("Unexpected bias rank to broadcast. Only 0 and 1 are supported.")
+
+
+def legalize_qnn_for_dnnl(mod):
+    """Transform qnn primitives to DNNL compatible form. Eliminate source zero point and apply
+    strict sequence of post ops."""
+    mod["main"] = rewrite(LegalizeQnnOpForDnnl(), mod["main"])
+
+    seq = tvm.transform.Sequential(
+        [
+            transform.InferType(),
+            # transform.SimplifyInference(),  # TODO: this pass decomposes nn.layer_norm
+            # transform.FoldScaleAxis(),  # TODO: fails inside TVM in case of grouped convolutions
+            transform.FoldConstant(),
+        ]
+    )
+    with tvm.transform.PassContext(opt_level=3):
+        mod = seq(mod)
+    return mod
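A hedged sketch of the intended call order: run the legalization before partitioning so the rewritten chains match the new `dnnl.qnn.*` patterns. `partition_for_dnnl` is this module's existing partitioning entry point, and `mod` is assumed to be a quantized Relay module.

```python
from tvm.relay.op.contrib import dnnl

mod = dnnl.legalize_qnn_for_dnnl(mod)  # rewrite qnn.conv2d / qnn.dense chains as above
mod = dnnl.partition_for_dnnl(mod)     # offload the matched "dnnl.qnn.*" patterns
```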
Eliminate the source zero point and apply + a strict sequence of post ops.""" + mod["main"] = rewrite(LegalizeQnnOpForDnnl(), mod["main"]) + + seq = tvm.transform.Sequential( + [ + transform.InferType(), + # transform.SimplifyInference(), # TODO: this pass decomposes nn.layer_norm + # transform.FoldScaleAxis(), # TODO: fails inside TVM in case of grouped convolutions. + transform.FoldConstant(), + ] + ) + with tvm.transform.PassContext(opt_level=3): + mod = seq(mod) + return mod diff --git a/python/tvm/runtime/ndarray.py b/python/tvm/runtime/ndarray.py index 9d3a3aff21659..16790ca2c783c 100644 --- a/python/tvm/runtime/ndarray.py +++ b/python/tvm/runtime/ndarray.py @@ -236,18 +236,21 @@ def numpy(self): return np_arr_ret.reshape(shape) return np_arr - def copyto(self, target): + def copyto(self, target, mem_scope=None): """Copy array to target Parameters ---------- target : NDArray The target array to be copied, must have same shape as this array. + + mem_scope : Optional[str] + The memory scope of the array. """ if isinstance(target, NDArrayBase): return self._copyto(target) if isinstance(target, Device): - res = empty(self.shape, self.dtype, target) + res = empty(self.shape, self.dtype, target, mem_scope) return self._copyto(res) raise ValueError("Unsupported target type %s" % str(type(target))) @@ -574,7 +577,7 @@ def webgpu(dev_id=0): mtl = metal -def array(arr, device=cpu(0)): +def array(arr, device=cpu(0), mem_scope=None): """Create an array from source arr. Parameters @@ -585,6 +588,9 @@ def array(arr, device=cpu(0)): device : Device, optional The device device to create the array + mem_scope : Optional[str] + The memory scope of the array + Returns ------- ret : NDArray @@ -595,7 +601,7 @@ def array(arr, device=cpu(0)): if not isinstance(arr, (np.ndarray, NDArray)): arr = np.array(arr) - return empty(arr.shape, arr.dtype, device).copyfrom(arr) + return empty(arr.shape, arr.dtype, device, mem_scope).copyfrom(arr) # Register back to FFI diff --git a/src/meta_schedule/database/database.cc b/src/meta_schedule/database/database.cc index 9905ff73c792c..5adff49984946 100644 --- a/src/meta_schedule/database/database.cc +++ b/src/meta_schedule/database/database.cc @@ -85,6 +85,19 @@ TuningRecord::TuningRecord(tir::Trace trace, Workload workload, Optionaldata_ = n; } +MeasureCandidate TuningRecordNode::AsMeasureCandidate() const { + tir::Schedule sch = + tir::Schedule::Traced(workload->mod, -1, 0, tir::ScheduleErrorRenderLevel::kDetail); + trace->ApplyToSchedule(sch, false, nullptr); + tir::PrimFunc func; + for (const auto& kv : sch->mod()->functions) { + func = Downcast(kv.second); + } + Array args_info = ArgInfo::FromPrimFunc(func); + MeasureCandidate candidate = MeasureCandidate(sch, args_info); + return candidate; +} + ObjectRef TuningRecordNode::AsJSON() const { Optional> json_args_info{nullptr}; Optional json_target{nullptr}; @@ -152,12 +165,15 @@ TuningRecord TuningRecord::FromJSON(const ObjectRef& json_obj, const Workload& w Database Database::PyDatabase(PyDatabaseNode::FHasWorkload f_has_workload, PyDatabaseNode::FCommitWorkload f_commit_workload, PyDatabaseNode::FCommitTuningRecord f_commit_tuning_record, - PyDatabaseNode::FGetTopK f_get_top_k, PyDatabaseNode::FSize f_size) { + PyDatabaseNode::FGetTopK f_get_top_k, + PyDatabaseNode::FGetAllTuningRecords f_get_all_tuning_records, + PyDatabaseNode::FSize f_size) { ObjectPtr n = make_object(); n->f_has_workload = f_has_workload; n->f_commit_workload = f_commit_workload; n->f_commit_tuning_record = f_commit_tuning_record; n->f_get_top_k =
f_get_top_k; + n->f_get_all_tuning_records = f_get_all_tuning_records; n->f_size = f_size; return Database(n); } @@ -179,6 +195,8 @@ TVM_REGISTER_GLOBAL("meta_schedule.TuningRecord") Optional target, Optional> args_info) { return TuningRecord(trace, workload, run_secs, target, args_info); }); +TVM_REGISTER_GLOBAL("meta_schedule.TuningRecordAsMeasureCandidate") + .set_body_method(&TuningRecordNode::AsMeasureCandidate); TVM_REGISTER_GLOBAL("meta_schedule.TuningRecordAsJSON") .set_body_method(&TuningRecordNode::AsJSON); TVM_REGISTER_GLOBAL("meta_schedule.TuningRecordFromJSON").set_body_typed(TuningRecord::FromJSON); @@ -190,6 +208,8 @@ TVM_REGISTER_GLOBAL("meta_schedule.DatabaseCommitTuningRecord") .set_body_method(&DatabaseNode::CommitTuningRecord); TVM_REGISTER_GLOBAL("meta_schedule.DatabaseGetTopK") .set_body_method(&DatabaseNode::GetTopK); +TVM_REGISTER_GLOBAL("meta_schedule.DatabaseGetAllTuningRecords") + .set_body_method(&DatabaseNode::GetAllTuningRecords); TVM_REGISTER_GLOBAL("meta_schedule.DatabaseSize").set_body_method(&DatabaseNode::Size); TVM_REGISTER_GLOBAL("meta_schedule.DatabasePyDatabase").set_body_typed(Database::PyDatabase); diff --git a/src/meta_schedule/database/json_database.cc b/src/meta_schedule/database/json_database.cc index 4f5bd9b136131..9bb7ee1027b99 100644 --- a/src/meta_schedule/database/json_database.cc +++ b/src/meta_schedule/database/json_database.cc @@ -156,6 +156,15 @@ class JSONDatabaseNode : public DatabaseNode { return results; } + Array GetAllTuningRecords() { + Array results; + results.reserve(Size()); + for (const TuningRecord& record : this->tuning_records_) { + results.push_back(record); + } + return results; + } + int64_t Size() { return tuning_records_.size(); } }; diff --git a/src/meta_schedule/tune_context.cc b/src/meta_schedule/tune_context.cc index 0c70dcf5c406f..57b2344c6f8db 100644 --- a/src/meta_schedule/tune_context.cc +++ b/src/meta_schedule/tune_context.cc @@ -142,7 +142,9 @@ Array TuneContextNode::_Join() { results.push_back(future->Result()); } } - this->search_strategy.value()->NotifyRunnerResults(this->measure_candidates.value(), results); + if (this->search_strategy.defined()) { + this->search_strategy.value()->NotifyRunnerResults(this->measure_candidates.value(), results); + } ICHECK(this->measure_candidates.defined()); ICHECK(this->builder_results.defined()); ICHECK_EQ(results.size(), this->measure_candidates.value().size()); @@ -177,6 +179,16 @@ TVM_REGISTER_GLOBAL("meta_schedule.TuneContext") TVM_REGISTER_GLOBAL("meta_schedule._SHash2Hex").set_body_typed(SHash2Hex); TVM_REGISTER_GLOBAL("meta_schedule.TuneContextInitialize") .set_body_method(&TuneContextNode::Initialize); +TVM_REGISTER_GLOBAL("meta_schedule.TuneContextSetMeasureCandidates") + .set_body_method(&TuneContextNode::_SetMeasureCandidates); +TVM_REGISTER_GLOBAL("meta_schedule.TuneContextSendToBuilder") + .set_body_method(&TuneContextNode::_SendToBuilder); +TVM_REGISTER_GLOBAL("meta_schedule.TuneContextSendToRunner") + .set_body_method(&TuneContextNode::_SendToRunner); +TVM_REGISTER_GLOBAL("meta_schedule.TuneContextJoin") + .set_body_method(&TuneContextNode::_Join); +TVM_REGISTER_GLOBAL("meta_schedule.TuneContextClearMeasureState") + .set_body_method(&TuneContextNode::_ClearMeasureState); } // namespace meta_schedule } // namespace tvm diff --git a/src/relay/backend/contrib/cmsisnn/extract_constants.cc b/src/relay/backend/contrib/cmsisnn/extract_constants.cc index 1cbe36e30f765..c6ed7af9ff031 100644 --- a/src/relay/backend/contrib/cmsisnn/extract_constants.cc +++ 
b/src/relay/backend/contrib/cmsisnn/extract_constants.cc @@ -164,7 +164,18 @@ class ExtractConstantsMutator : public MixedModeMutator { function_signature.push_back(arg); } else { if (arg.as()) { - function_signature.push_back(arg); + // Only push the argument if it is not already present: an input var with + // multiple consumers should appear only once in the function signature. + bool found_in_existing_signature = false; + for (auto& sign : function_signature) { + if (arg.same_as(sign)) { + found_in_existing_signature = true; + break; + } + } + if (!found_in_existing_signature) { + function_signature.push_back(arg); + } } new_args.push_back(arg); } diff --git a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc index 524735caa9d6a..5c99061fa854b 100644 --- a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc +++ b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc @@ -556,7 +556,12 @@ class RelayToTIRVisitor : public MixedModeMutator { BufferCreator buffer_creator; tir::Var input_0 = buffer_creator.CreateBufferVar("input_0", DataType::Handle(8)); - tir::Var input_1 = buffer_creator.CreateBufferVar("input_1", DataType::Handle(8)); + tir::Var input_1; + if (mul_call->args[0].same_as(mul_call->args[1])) { + input_1 = input_0; + } else { + input_1 = buffer_creator.CreateBufferVar("input_1", DataType::Handle(8)); + } tir::Var output = buffer_creator.CreateBufferVar("output", DataType::Handle(8)); tvm::Array args = { @@ -626,7 +631,12 @@ class RelayToTIRVisitor : public MixedModeMutator { BufferCreator buffer_creator; tir::Var input_0 = buffer_creator.CreateBufferVar("input_0", DataType::Handle(8)); - tir::Var input_1 = buffer_creator.CreateBufferVar("input_1", DataType::Handle(8)); + tir::Var input_1; + if (add_call->args[0].same_as(add_call->args[1])) { + input_1 = input_0; + } else { + input_1 = buffer_creator.CreateBufferVar("input_1", DataType::Handle(8)); + } tir::Var output = buffer_creator.CreateBufferVar("output", DataType::Handle(8)); tvm::Array args = { diff --git a/src/relay/backend/contrib/cmsisnn/scalar_to_tensor_constant.cc b/src/relay/backend/contrib/cmsisnn/scalar_to_tensor_constant.cc index 2448bfc766306..40fd773eb2092 100644 --- a/src/relay/backend/contrib/cmsisnn/scalar_to_tensor_constant.cc +++ b/src/relay/backend/contrib/cmsisnn/scalar_to_tensor_constant.cc @@ -179,6 +179,12 @@ class ScalarToTensorConstantMutator : public MixedModeMutator { auto new_body = VisitExpr(func->body); Function new_func = WithFields(func, FreeVars(new_body), new_body, func->ret_type, FreeTypeVars(new_body, mod_), func->attrs); + + // Updating new_func parameters may deduplicate the function parameters. + // Call arguments need to be aligned to the number of arguments expected by new_func. + if (new_args[0].same_as(new_args[1])) { + new_args.erase(new_args.begin()); + } return Call(new_func, new_args); } diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index 927cd12ae0fb9..f17cdafa76a5f 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -35,6 +35,7 @@ #include #include "../../utils.h" +#include "comp_op_matcher.h" #ifdef USE_JSON_RUNTIME #include "../../../../runtime/contrib/json/json_node.h" @@ -436,6 +437,30 @@ class DNNLModuleCodegen : public CSourceModuleCodegenBase { #else // DNNL JSON runtime +/*!
+ * \brief Replace var exprs which are bound to args of the call node + * + * \param args vector of expressions (vars or constant nodes) + * \param cn call node which describes the mapping of internal body vars to args + * \return updated vector of expressions + */ +static tvm::Array BindToCallNodeArgs(const std::vector& args, const CallNode* cn) { + tvm::Array res; + for (const auto& arg : args) { + if (arg->IsInstance()) { + res.push_back(arg); + } else { + auto body_params = cn->op.as()->params; + auto found = std::find(body_params.begin(), body_params.end(), arg); + ICHECK(found != body_params.end()); + auto idx = std::distance(body_params.begin(), found); + res.push_back(cn->args[idx]); + } + } + return res; +} + +/*! \brief Serializer to DNNL JSON runtime module */ class DNNLJSONSerializer : public backend::contrib::JSONSerializer { using JSONGraphNode = tvm::runtime::json::JSONGraphNode; using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry; @@ -475,14 +500,19 @@ class DNNLJSONSerializer : public backend::contrib::JSONSerializer { } public: - DNNLJSONSerializer(const std::string& symbol, const Expr& expr) : JSONSerializer(symbol, expr) {} + DNNLJSONSerializer(const std::string& symbol, const Expr& expr) + : JSONSerializer("dnnl_" + symbol, expr) {} std::vector VisitExpr_(const CallNode* cn) override { Expr expr = GetRef(cn); std::string name; + tvm::Array args; + std::unordered_map extra_attrs; + const CallNode* call = cn; if (const auto* op_node = cn->op.as()) { name = op_node->name; + args = cn->args; } else if (const auto* fn = cn->op.as()) { auto comp = fn->GetAttr(attr::kComposite); ICHECK(comp.defined()) << "DNNL JSON runtime only supports composite functions."; @@ -511,15 +541,24 @@ class DNNLJSONSerializer : public backend::contrib::JSONSerializer { } else if (name.find("dnnl.dense") != std::string::npos) { call = GetRootCall(fn->body.as(), 10, "nn.dense"); ICHECK(call->op.as()) << "Not op node"; + } else if (name.find("dnnl.qnn.conv2d") != std::string::npos || + name.find("dnnl.qnn.dense") != std::string::npos) { + std::vector args_loc; + call = ParseComposite(*fn, &extra_attrs, &args_loc); + args = BindToCallNodeArgs(args_loc, cn); } else { LOG(FATAL) << "Unrecognized DNNL pattern: " << name; } + + if (args.empty()) { + args = cn->args; + } } else { LOG(FATAL) << "DNNL JSON runtime does not support calls to " << cn->op->GetTypeKey(); } std::vector inputs; - for (const auto& arg : cn->args) { + for (const auto& arg : args) { auto res = VisitExpr(arg); inputs.insert(inputs.end(), res.begin(), res.end()); } @@ -527,6 +566,8 @@ class DNNLJSONSerializer : public backend::contrib::JSONSerializer { "kernel", /* op_type_ */ inputs, 1 /* num_outputs_ */); SetCallNodeAttribute(node, call); + for (const auto& kvp : extra_attrs) node->SetAttr(kvp.first, kvp.second); + return AddNode(node, GetRef(cn)); } }; @@ -558,6 +599,61 @@ runtime::Module DNNLCompiler(const ObjectRef& ref) { TVM_REGISTER_GLOBAL("relay.ext.dnnl").set_body_typed(DNNLCompiler); +/*! + * \brief Constant Updater for DNNL JSON runtime + * + * Not every originally existing ConstantNode should be passed to the JSON runtime. + * Some of them may be skipped, or their ordering may change. So we have to apply the same + * graph traversal as DNNLJSONSerializer.
+ */ +struct DNNLConstantUpdater : public ConstantUpdater { + public: + DNNLConstantUpdater(const std::string& symbol, + std::unordered_map* params) + : ConstantUpdater("dnnl_" + symbol, params) {} + using ConstantUpdater::VisitExpr_; + + void VisitExpr_(const CallNode* cn) final { + this->VisitSpan(cn->span); + + if (const auto* fn = cn->op.as()) { + std::vector args_loc; + std::unordered_map attrs; + auto root_cn = ParseComposite(*fn, &attrs, &args_loc); + + auto args = root_cn ? BindToCallNodeArgs(args_loc, cn) : cn->args; + + // Customized visit order of args + for (const auto& arg : args) { + this->VisitExpr(arg); + } + } else { + // Original visit order of args + for (auto arg : cn->args) { + this->VisitExpr(arg); + } + } + } +}; + +/*! + * \brief The external compiler/codegen tool. It takes a Relay expression/module and + * produces a collection of the required constant NDArrays. + */ +Map DNNLConstantUpdaterFunc(Expr expr, std::string symbol) { + // Visit all suitable constant nodes + std::unordered_map res; + DNNLConstantUpdater const_updater(symbol, &res); + const_updater(expr); + + // Convert to tvm::Map + Map ret; + for (const auto& kvp : res) ret.Set(kvp.first, kvp.second); + return ret; +} + +TVM_REGISTER_GLOBAL("relay.ext.dnnl.constant_updater").set_body_typed(DNNLConstantUpdaterFunc); + } // namespace contrib } // namespace relay } // namespace tvm diff --git a/src/relay/backend/contrib/dnnl/comp_op_matcher.h b/src/relay/backend/contrib/dnnl/comp_op_matcher.h new file mode 100644 index 0000000000000..364cc6e377ca8 --- /dev/null +++ b/src/relay/backend/contrib/dnnl/comp_op_matcher.h @@ -0,0 +1,245 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/backend/contrib/dnnl/comp_op_matcher.h + * \brief Implement matcher-based functions to parse complex composite nodes. + */ + +#ifndef TVM_RELAY_BACKEND_CONTRIB_DNNL_COMP_OP_MATCHER_H_ +#define TVM_RELAY_BACKEND_CONTRIB_DNNL_COMP_OP_MATCHER_H_ + +#include + +#include +#include +#include + +#include "../../../ir/dataflow_matcher_impl.h" + +/*! + * \brief Convert a value to a dmlc-attr-acceptable format + * + * \tparam T type of value (auto deduction) + * \param val value to convert + * \return resulting dmlc object + */ +template ::value, bool> = true> +dmlc::any dmlc_attr(const T& val) { + std::vector attr; + attr.emplace_back(std::vector{std::to_string(val)}); + return dmlc::any{attr}; +} + +template ::value, bool> = true> +dmlc::any dmlc_attr(const T& val) { + std::vector attr; + attr.emplace_back(std::vector{val}); + return dmlc::any{attr}; +} + +template >::value, bool> = true> +dmlc::any dmlc_attr(const T& val) { + std::vector attr; + attr.emplace_back(val); + return dmlc::any{attr}; +} + +/*!
\brief Constructor of const scalar expression with defined type */ +tvm::relay::Expr constant(float val) { + auto value = tvm::runtime::NDArray::Empty({}, tvm::DataType::Float(32), {kDLCPU, 0}); + value.CopyFromBytes(&val, sizeof(val)); + auto res = tvm::relay::Constant(value); + tvm::relay::transform::InferTypeLocal(res); + return res; +} + +/*! + * \brief Simple helper to accumulate composite function arguments and corresponding attributes + * with indexes of them. + */ +class ArgPacker { + public: + ArgPacker(std::unordered_map* attrs, std::vector* args) + : attrs_(attrs), args_(args) {} + + int Put(const tvm::relay::Expr& arg, std::string tag_name = "") { + if (!arg.defined()) return -1; + int idx = args_->size(); + args_->push_back(arg); + if (!tag_name.empty()) { + attrs_->operator[](tag_name) = dmlc_attr(idx); + } + return idx; + } + + private: + std::unordered_map* attrs_; + std::vector* args_; +}; + +const tvm::relay::CallNode* ParseQnnConvComp(const tvm::relay::FunctionNode& comp_fn, + std::unordered_map* ext_attrs, + std::vector* args) { + using namespace tvm::relay; + + // Pattern + auto src = IsWildcard(); + auto wgh = IsWildcard(); + auto sum_src = IsWildcard(); + auto bias = IsConstant(); + + auto o_scl = IsConstant(); + auto act_scl = IsConstant(); + auto sum_scl = IsConstant(); + auto dst_zp = IsConstant(); + + DFPattern cnv; + DFPattern pat; + + cnv = IsOp("qnn.conv2d")({src, wgh, IsConstant(), IsConstant(), IsConstant(), IsConstant()}); + pat = IsOp("cast")({cnv}); + pat = IsOp("add")({pat, bias}) || pat; + pat = IsOp("multiply")({pat, o_scl}); + pat = IsOp("clip")({pat}); + pat = IsOp("multiply")({pat, act_scl}) || pat; + pat = IsOp("add")({pat, sum_scl * IsOp("cast")({sum_src})}) || pat; + pat = IsOp("add")({pat, dst_zp}) || pat; + pat = IsOp("cast")({pat}); + + // Check pattern match + auto indexed_body = CreateIndexedGraph(comp_fn.body); + DFPatternMatcher matcher(indexed_body.get()); + auto res = matcher.Match(pat, comp_fn.body); + ICHECK(res) << "Mismatch of DNNL partitioner and codegen logic"; + + // Handle arguments in deterministic order + auto map = matcher.GetMemo(); + auto find = [&map](const DFPattern& pat) -> tvm::relay::Expr { + if (map.count(pat)) return map.at(pat)[0]; + return {}; + }; + + ArgPacker arg_holder(ext_attrs, args); + arg_holder.Put(find(src)); + arg_holder.Put(find(wgh)); + arg_holder.Put(find(bias), "bias_idx"); + arg_holder.Put(find(sum_src), "sum_idx"); + arg_holder.Put(find(o_scl), "o_scl_idx"); + arg_holder.Put(find(act_scl), "act_scl_idx"); + arg_holder.Put(find(sum_scl), "sum_scl_idx"); + arg_holder.Put(find(dst_zp), "dst_zp_idx"); + + // Activation. Default clip to simulate relu via uint8 cast + std::vector clip_attr{"clip"}; + auto act_scl_val = map.count(act_scl) ? 
find(act_scl) : constant(1.0); + clip_attr.push_back(std::to_string(arg_holder.Put(act_scl_val))); // act_scale + clip_attr.push_back(std::to_string(arg_holder.Put(constant(0.0)))); // alpha + clip_attr.push_back(std::to_string(arg_holder.Put(constant(255.0)))); // beta + (*ext_attrs)["activation"] = dmlc_attr(clip_attr); + + return map.at(cnv)[0].as(); +} + +const tvm::relay::CallNode* ParseQnnDenseComp(const tvm::relay::FunctionNode& comp_fn, + std::unordered_map* ext_attrs, + std::vector* args) { + using namespace tvm::relay; + + // Pattern + auto src = IsWildcard(); + auto wgh = IsWildcard(); + auto sum_src = IsWildcard(); + auto bias = IsConstant(); + + auto o_scl = IsConstant(); + auto act_scl = IsConstant(); + auto sum_scl = IsConstant(); + auto dst_zp = IsConstant(); + + DFPattern dns, act, pat; + + dns = IsOp("qnn.dense")({src, wgh, IsConstant(), IsConstant(), IsConstant(), IsConstant()}); + pat = IsOp("cast")({dns}); + pat = IsOp("add")({pat, bias}) || pat; + pat = IsOp("multiply")({pat, o_scl}); + pat = IsOp("clip")({pat}); + pat = IsOp("multiply")({pat, act_scl}) || pat; + pat = IsOp("add")({pat, sum_scl * IsOp("cast")({sum_src})}) || pat; + pat = IsOp("add")({pat, dst_zp}) || pat; + pat = IsOp("cast")({pat}); + + // Check pattern match + auto indexed_body = CreateIndexedGraph(comp_fn.body); + DFPatternMatcher matcher(indexed_body.get()); + auto res = matcher.Match(pat, comp_fn.body); + ICHECK(res) << "Mismatch of DNNL partitioner and codegen logic"; + + // Handle arguments in deterministic order + auto memo = matcher.GetMemo(); + auto find = [&memo](const DFPattern& pat) -> tvm::relay::Expr { + if (memo.count(pat)) return memo.at(pat)[0]; + return {}; + }; + + ArgPacker arg_holder(ext_attrs, args); + arg_holder.Put(find(src)); + arg_holder.Put(find(wgh)); + arg_holder.Put(find(bias), "bias_idx"); + arg_holder.Put(find(sum_src), "sum_idx"); + arg_holder.Put(find(o_scl), "o_scl_idx"); + arg_holder.Put(find(act_scl), "act_scl_idx"); + arg_holder.Put(find(sum_scl), "sum_scl_idx"); + arg_holder.Put(find(dst_zp), "dst_zp_idx"); + + // Activation. Default clip to simulate relu via uint8 cast + std::vector clip_attr{"clip"}; + auto act_scl_val = memo.count(act_scl) ? find(act_scl) : constant(1.0); + clip_attr.push_back(std::to_string(arg_holder.Put(act_scl_val))); // act_scale + clip_attr.push_back(std::to_string(arg_holder.Put(constant(0.0)))); // alpha + clip_attr.push_back(std::to_string(arg_holder.Put(constant(255.0)))); // beta + (*ext_attrs)["activation"] = dmlc_attr(clip_attr); + + return memo.at(dns)[0].as(); +} + +/*! 
+ * Parse a composite function and return the real args, additional attributes, and root call node + * \param comp_fn composite function to parse + * \param ext_attrs attr collection with additional attributes + * \param args real arguments of the node + * \return root call node + */ +const tvm::relay::CallNode* ParseComposite(const tvm::relay::FunctionNode& comp_fn, + std::unordered_map* ext_attrs, + std::vector* args) { + auto comp = comp_fn.GetAttr(tvm::relay::attr::kComposite); + ICHECK(comp.defined()) << "DNNL JSON runtime only supports composite functions."; + auto name = comp.value(); + + const tvm::relay::CallNode* res = nullptr; + if (name == "dnnl.qnn.conv2d") + res = ParseQnnConvComp(comp_fn, ext_attrs, args); + else if (name == "dnnl.qnn.dense") + res = ParseQnnDenseComp(comp_fn, ext_attrs, args); + return res; +} + +#endif // TVM_RELAY_BACKEND_CONTRIB_DNNL_COMP_OP_MATCHER_H_ diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc index 5b23843c95e62..8715900c0c4a1 100644 --- a/src/relay/backend/te_compiler_cache.cc +++ b/src/relay/backend/te_compiler_cache.cc @@ -193,7 +193,9 @@ class LowerToTECompute : public backend::MemoizedExprTranslatorchecked_type().as(); std::stringstream ss; - ss << "constant_" << const_index++; + std::string s = readable_name_stream_.str(); + std::replace(s.begin(), s.end(), '.', '_'); + ss << s << "_constant_" << const_index++; tvm::te::Tensor tensor = tvm::te::placeholder(GetShape(ttype->shape), ttype->dtype, ss.str()); constant_tensors_[op] = tensor; return {tensor}; diff --git a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc index 5045f3323af7c..a4239186b4b33 100644 --- a/src/runtime/contrib/dnnl/dnnl_json_runtime.cc +++ b/src/runtime/contrib/dnnl/dnnl_json_runtime.cc @@ -134,9 +134,56 @@ class DNNLJSONRuntime : public JSONRuntimeBase { {"tanh", dnnl::algorithm::eltwise_tanh}, {"sigmoid", dnnl::algorithm::eltwise_logistic}, {"clip", dnnl::algorithm::eltwise_clip}, + {"gelu_erf", dnnl::algorithm::eltwise_gelu_erf}, }; - bool ParsingOpName(const std::string op_name, dnnl::primitive_attr attr) { + dnnl::primitive_attr ParseAttrs(const size_t& nid, TensorRequisite* bias_tr) { + dnnl::primitive_attr attr; + + // Post op attributes based on named inputs. + auto dst_zp_tr = GetInputByName(nid, "dst_zp_idx"); + auto o_scl_tr = GetInputByName(nid, "o_scl_idx"); + auto sum_scl_tr = GetInputByName(nid, "sum_scl_idx"); + + if (o_scl_tr) { + ICHECK(o_scl_tr.IsConstant()); + auto data = o_scl_tr.GetConstDataLikeVec(); + attr.set_output_scales(data.size() == 1 ? 0 : (1 << 1), data); + } + + auto activation = GetNodeAttr>(nodes_[nid], "activation", {"none"}); + if (activation[0] != "none") { + auto a_type = elt_name2algo.at(activation[0]); + auto a_scale = GetInput(nid, std::stoi(activation[1])).GetConstScalarData(); + auto a_alfa = GetInput(nid, std::stoi(activation[2])).GetConstScalarData(); + auto a_beta = GetInput(nid, std::stoi(activation[3])).GetConstScalarData(); + + auto ops = attr.get_post_ops(); + ops.append_eltwise(a_scale, a_type, a_alfa, a_beta); + attr.set_post_ops(ops); + } + + if (sum_scl_tr) { + auto scl = sum_scl_tr.GetConstScalarData(); + auto ops = attr.get_post_ops(); + ops.append_sum(scl); + attr.set_post_ops(ops); + } + + if (dst_zp_tr) { + auto zp = dst_zp_tr.GetConstScalarData(); + // Use a linear post op instead of set_zero_points(): set_zero_points() is limited + // to int32, while we need a float zero point here.
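+ // (In oneDNN, eltwise_linear computes alpha * x + beta, so appending it with
+ // alpha = 1.0 and beta = zp adds the zero point in float arithmetic as the
+ // last post op.)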
+ auto ops = attr.get_post_ops(); + ops.append_eltwise(1.0, dnnl::algorithm::eltwise_linear, 1.0, zp); + attr.set_post_ops(ops); + } + *bias_tr = GetInputByName(nid, "bias_idx"); + + if (o_scl_tr || activation[0] != "none" || sum_scl_tr || dst_zp_tr) return attr; + + // parsing of name to extract attributes + auto op_name = nodes_[nid].GetOpName(); // Define RegExp. std::regex bias_add_pat(".*_bias.*"); std::regex relu_pat(".*_relu.*"); @@ -163,7 +210,9 @@ class DNNLJSONRuntime : public JSONRuntimeBase { } // Parsing bias_add. - return std::regex_match(op_name, bias_add_pat) ? true : false; + *bias_tr = std::regex_match(op_name, bias_add_pat) ? GetInput(nid, 2) : TensorRequisite{}; + + return attr; } // Build up the engine based on the input graph. @@ -219,16 +268,16 @@ class DNNLJSONRuntime : public JSONRuntimeBase { void Convolution(const size_t& nid) { auto node = nodes_[nid]; - auto op_name = node.GetOpName(); - dnnl::primitive_attr attr; - attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); - bool has_bias = ParsingOpName(op_name, attr); // Setup attributes. auto src_tr = GetInput(nid, 0); auto wgh_tr = GetInput(nid, 1); auto dst_tr = GetOutput(nid, 0); - auto bias_tr = has_bias ? GetInput(nid, 2) : GetInput(nid, -1); + auto bias_tr = TensorRequisite{}; + + auto attr = ParseAttrs(nid, &bias_tr); + attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + auto strides = GetNodeAttr>(node, "strides"); auto dilates = GetNodeAttr>(node, "dilation"); auto padding = GetNodeAttr>(node, "padding"); @@ -292,25 +341,29 @@ class DNNLJSONRuntime : public JSONRuntimeBase { auto scratchpad_tr = TensorRequisite::AsIs(conv_prim_desc.scratchpad_desc()); - Submit(dnnl::convolution_forward(conv_prim_desc), {{DNNL_ARG_SRC, src_tr}, - {DNNL_ARG_WEIGHTS, wgh_tr}, - {DNNL_ARG_BIAS, bias_tr}, - {DNNL_ARG_SCRATCHPAD, scratchpad_tr}, - {DNNL_ARG_DST, dst_tr}}); + // TODO(@apeskov): Simulation of inplace primitive. just as PoC. + auto sum_in_tr = GetInputByName(nid, "sum_idx").TreatAs(dst_layout); + + Submit(dnnl::convolution_forward(conv_prim_desc), + {{DNNL_ARG_SRC, src_tr}, + {DNNL_ARG_WEIGHTS, wgh_tr}, + {DNNL_ARG_BIAS, bias_tr}, + {DNNL_ARG_SCRATCHPAD, scratchpad_tr}, + {DNNL_ARG_DST, dst_tr}}, + {sum_in_tr, DNNL_ARG_DST}); } void Deconvolution(const size_t& nid) { auto node = nodes_[nid]; - auto op_name = node.GetOpName(); - dnnl::primitive_attr attr; - attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); - bool has_bias = ParsingOpName(op_name, attr); // Setup attributes. auto src_tr = GetInput(nid, 0); auto wgh_tr = GetInput(nid, 1); auto dst_tr = GetOutput(nid, 0); - auto bias_tr = has_bias ? GetInput(nid, 2) : GetInput(nid, -1); + auto bias_tr = TensorRequisite{}; + + auto attr = ParseAttrs(nid, &bias_tr); + attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); auto strides = GetNodeAttr>(node, "strides"); auto dilates = GetNodeAttr>(node, "dilation"); @@ -374,16 +427,15 @@ class DNNLJSONRuntime : public JSONRuntimeBase { void Dense(const size_t& nid) { auto node = nodes_[nid]; - auto op_name = node.GetOpName(); - dnnl::primitive_attr attr; - attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); - bool has_bias = ParsingOpName(op_name, attr); // Setup attributes. auto src_tr = GetInput(nid, 0); auto wgh_tr = GetInput(nid, 1); auto dst_tr = GetOutput(nid, 0); - auto bias_tr = has_bias ? 
GetInput(nid, 2) : GetInput(nid, -1); + auto bias_tr = TensorRequisite{}; + + auto attr = ParseAttrs(nid, &bias_tr); + attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); // Assumption that bias is correct and can be squeezed to 1D bias_tr = bias_tr.Reshape({dst_tr.dims()[1]}); @@ -403,11 +455,16 @@ class DNNLJSONRuntime : public JSONRuntimeBase { auto scratchpad_tr = TensorRequisite::AsIs(dense_prim_desc.scratchpad_desc()); - Submit(dnnl::inner_product_forward(dense_prim_desc), {{DNNL_ARG_SRC, src_tr}, - {DNNL_ARG_WEIGHTS, wgh_tr}, - {DNNL_ARG_BIAS, bias_tr}, - {DNNL_ARG_SCRATCHPAD, scratchpad_tr}, - {DNNL_ARG_DST, dst_tr}}); + // TODO(@apeskov): Simulation of inplace primitive. just as PoC. + auto sum_in_tr = GetInputByName(nid, "sum_idx"); + + Submit(dnnl::inner_product_forward(dense_prim_desc), + {{DNNL_ARG_SRC, src_tr}, + {DNNL_ARG_WEIGHTS, wgh_tr}, + {DNNL_ARG_BIAS, bias_tr}, + {DNNL_ARG_SCRATCHPAD, scratchpad_tr}, + {DNNL_ARG_DST, dst_tr}}, + {sum_in_tr, DNNL_ARG_DST}); } void BatchNorm(const size_t& nid) { @@ -675,6 +732,11 @@ class DNNLJSONRuntime : public JSONRuntimeBase { return res; } + TensorRequisite GetInputByName(const size_t& nid, const std::string& name) { + auto idx = GetNodeAttr(nodes_[nid], name, {"-1"}); + return GetInput(nid, idx); + } + TensorRequisite GetOutput(const size_t& nid, const int idx) { if (idx == -1) return {}; // -1 reserved value for empty input. @@ -692,8 +754,8 @@ class DNNLJSONRuntime : public JSONRuntimeBase { } /*! \brief Helper function to register primitive into execution queue */ - void Submit(const dnnl::primitive& prim, - const std::unordered_map& tr_args) { + void Submit(const dnnl::primitive& prim, const std::unordered_map& tr_args, + const std::pair& inplace_conf = {}) { // Register all provided TR arguments std::unordered_map prim_arg_id; TensorRegistry::ActionQue post_prim_actions; @@ -706,6 +768,18 @@ class DNNLJSONRuntime : public JSONRuntimeBase { prim_arg_id[key] = arg_id; } + // Simulate inplace primitive + if (auto tr = inplace_conf.first) { + auto arg_id = tensor_registry_.Register(tr, &net_); + auto dst_tr = tr_args.at(inplace_conf.second); + auto dst_arg_id = prim_arg_id.at(inplace_conf.second); + + // Register copy action direct before main primitive + dnnl::reorder::primitive_desc io_copy_pd(engine_, tr.desc(), engine_, dst_tr.desc()); + net_.push_back( + {dnnl::reorder(io_copy_pd), {{DNNL_ARG_SRC, arg_id}, {DNNL_ARG_DST, dst_arg_id}}}); + } + // Register main primitive net_.push_back({prim, prim_arg_id}); diff --git a/src/runtime/contrib/dnnl/dnnl_tensor_requisite.h b/src/runtime/contrib/dnnl/dnnl_tensor_requisite.h index d02ceff5de823..bad4bc10edec3 100644 --- a/src/runtime/contrib/dnnl/dnnl_tensor_requisite.h +++ b/src/runtime/contrib/dnnl/dnnl_tensor_requisite.h @@ -275,6 +275,7 @@ class TensorRequisite { * innermost. 
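+ * An undefined requisite is returned unchanged (see the early return below), so
+ * optional inputs resolved to index -1 can pass through layout handling safely.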
*/ TensorRequisite TreatAs(const std::string& layout, std::string desired_logic_layout = "") const { + if (!defined()) return *this; if (desired_logic_layout.empty()) desired_logic_layout = DefaultLogicLayoutFor(layout); const auto origin_dims = dims(); diff --git a/src/runtime/pipeline/pipeline_executor.cc b/src/runtime/pipeline/pipeline_executor.cc index a191f816f7159..b5c560e255e3b 100644 --- a/src/runtime/pipeline/pipeline_executor.cc +++ b/src/runtime/pipeline/pipeline_executor.cc @@ -34,6 +34,9 @@ PackedFunc PipelineExecutor::GetFunction(const std::string& name, if (name == "get_num_outputs") { return PackedFunc( [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->NumOutputs(); }); + } else if (name == "get_num_inputs") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->NumInputs(); }); } else if (name == "get_input_pipeline_map") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { if (String::CanConvertFrom(args[0])) { @@ -87,7 +90,10 @@ PackedFunc PipelineExecutor::GetFunction(const std::string& name, return PackedFunc(); } } - +/*! + * \brief Returns the number of global inputs. + */ +int PipelineExecutor::NumInputs(void) { return input_connection_config_.GetInputNum(); } /*! * \brief set input to the runtime module. * \param input_name The input name. diff --git a/src/runtime/pipeline/pipeline_executor.h b/src/runtime/pipeline/pipeline_executor.h index 9f9b24bdf0bec..87b50ed3a1a90 100644 --- a/src/runtime/pipeline/pipeline_executor.h +++ b/src/runtime/pipeline/pipeline_executor.h @@ -115,6 +115,7 @@ class TVM_DLL PipelineExecutor : public ModuleNode { int NumOutputs() const { return num_outputs_; } /*!\brief Run the pipeline executor.*/ void Run(); + int NumInputs(); /*! * \brief Get a list output data. * \return A list of output data. diff --git a/src/runtime/pipeline/pipeline_struct.h b/src/runtime/pipeline/pipeline_struct.h index 2cb7b4a6d24e5..540103d0186cb 100644 --- a/src/runtime/pipeline/pipeline_struct.h +++ b/src/runtime/pipeline/pipeline_struct.h @@ -560,6 +560,9 @@ struct InputConnectionConfig { } return input_connection[key]; } + /*!\brief Returns the number of global inputs through the input_runtime_map list size.*/ + int GetInputNum() { return input_runtime_map.size(); } + /*! * \brief Getting the global input index through the input name. * \param input_name The global input name.
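A quick standalone sanity check (an editorial illustration only, not part of the patch) of the scale/bias folding that LegalizeQnnOpForDnnl in python/tvm/relay/op/contrib/dnnl.py relies on, for the path without the optional sum. All constants below are made-up scalars; in the pass they are Relay constants:

    import numpy as np

    rng = np.random.default_rng(0)
    conv_out = rng.integers(-100, 100, size=8).astype("float64")  # OP(SRC, WGH)
    zp_term = rng.integers(-10, 10, size=8).astype("float64")     # OP(src_zp, WGH)
    orig_bias, rq_in_zp, rq_out_zp = 7.0, 3.0, 10.0
    rq_in_scl, rq_out_scl = 0.02, 0.2

    # Original form: zero-point correction, bias, then qnn.requantize.
    ref = (conv_out - zp_term + orig_bias - rq_in_zp) * rq_in_scl / rq_out_scl + rq_out_zp

    # Legalized form: one folded bias plus a single output scale o_scl.
    o_scl = rq_in_scl / rq_out_scl
    bias = orig_bias - zp_term - rq_in_zp + rq_out_zp * rq_out_scl / rq_in_scl
    np.testing.assert_allclose((conv_out + bias) * o_scl, ref)  # the two forms agree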
diff --git a/tests/micro/zephyr/test_zephyr.py b/tests/micro/zephyr/test_zephyr.py index 2651435434b11..05c8daa20c215 100644 --- a/tests/micro/zephyr/test_zephyr.py +++ b/tests/micro/zephyr/test_zephyr.py @@ -17,7 +17,6 @@ import logging import os import pathlib -import sys import logging import pytest diff --git a/tests/python/contrib/test_cmsisnn/test_binary_ops.py b/tests/python/contrib/test_cmsisnn/test_binary_ops.py index fec18c197e045..26604da0a64aa 100644 --- a/tests/python/contrib/test_cmsisnn/test_binary_ops.py +++ b/tests/python/contrib/test_cmsisnn/test_binary_ops.py @@ -101,7 +101,7 @@ def make_model( def test_op_int8( op, relu_type, input_0_scale, input_0_zero_point, input_1_scale, input_1_zero_point ): - """Tests QNN Conv2D operator for CMSIS-NN""" + """Tests QNN binary operator for CMSIS-NN""" interface_api = "c" use_unpacked_api = True test_runner = AOT_USMP_CORSTONE300_RUNNER @@ -145,6 +145,65 @@ def test_op_int8( ) +@skip_if_no_reference_system +@tvm.testing.requires_cmsisnn +@pytest.mark.parametrize("op", [relay.qnn.op.mul, relay.qnn.op.add]) +@pytest.mark.parametrize("relu_type", ["RELU", "NONE"]) +def test_same_input_to_binary_op(op, relu_type): + """Tests QNN binary operator for CMSIS-NN where both inputs are the same""" + interface_api = "c" + use_unpacked_api = True + test_runner = AOT_USMP_CORSTONE300_RUNNER + + dtype = "int8" + shape = [1, 16, 16, 3] + input_ = generate_variable("input") + input_scale = 0.256 + input_zero_point = 33 + + model = make_model( + op, + input_, + input_, + input_scale, + input_zero_point, + input_scale, + input_zero_point, + relu_type, + ) + orig_mod = make_module(model) + + cmsisnn_mod = cmsisnn.partition_for_cmsisnn(orig_mod) + + # validate pattern matching + assert_partitioned_function(orig_mod, cmsisnn_mod) + + # Check if the number of internal function parameter is 1 + cmsisnn_global_func = cmsisnn_mod["tvmgen_default_cmsis_nn_main_0"] + assert ( + isinstance(cmsisnn_global_func.body, tvm.relay.expr.Call) + and len(cmsisnn_global_func.body.args) == 1 + ), "Composite function for the binary op should have only 1 parameter." + + # validate the output + in_min, in_max = get_range_for_dtype_str(dtype) + inputs = { + "input": np.random.randint(in_min, high=in_max, size=shape, dtype=dtype), + } + output_list = generate_ref_data(orig_mod["main"], inputs) + compile_and_run( + AOTTestModel( + module=cmsisnn_mod, + inputs=inputs, + outputs=output_list, + output_tolerance=1, + ), + test_runner, + interface_api, + use_unpacked_api, + ) + + def parameterize_for_constant_inputs(test): """Generates parameters in such a way so that at least one of the inputs is a constant, both can't be variables, both can't be scalars. diff --git a/tests/python/contrib/test_cmsisnn/test_extract_constants.py b/tests/python/contrib/test_cmsisnn/test_extract_constants.py index 8831596d40e63..7d3e81a9c79d2 100644 --- a/tests/python/contrib/test_cmsisnn/test_extract_constants.py +++ b/tests/python/contrib/test_cmsisnn/test_extract_constants.py @@ -116,6 +116,40 @@ def test_nested_function(): relay.transform.InferType()(mod) +@tvm.testing.requires_cmsisnn +def test_internal_function_with_duplicate_arguments(): + """Tests the pass ExternConstants when a composite function + is present within global function with repeating arguments + to one of the binary ops. 
+ """ + input0 = relay.var("input0", shape=(8, 8)) + binary_op0 = input0 + input0 + binary_op1 = binary_op0 * relay.const(5.0, "float32") + local_func = relay.Function([input0], binary_op1, relay.TensorType((8, 8), "float32")) + local_func = set_composite_func_attr(local_func, "cmsis-nn") + + arg = relay.var("arg", shape=(8, 8)) + call_local_func = relay.Call(local_func, [arg]) + extern_func = relay.Function([arg], call_local_func, relay.TensorType((8, 8), "float32")) + + global_arg = relay.var("global_var", shape=(8, 8)) + global_var = relay.GlobalVar("external_function") + extern_func = set_external_func_attr(extern_func, "cmsis-nn", global_var.name_hint) + call_extern_func = relay.Call(global_var, [global_arg]) + main_func = relay.Function([global_arg], call_extern_func, relay.TensorType((8, 8), "float32")) + main_var = relay.GlobalVar("main") + + mod = tvm.IRModule() + mod[global_var] = extern_func + mod[main_var] = main_func + + mod = ExtractConstantsFromPartitionedFunction()(mod) + constant_verifier = CheckFunctionsForConstants() + constant_verifier.visit_function(mod[global_var]) + constant_verifier.check_num_constants() + relay.transform.InferType()(mod) + + @tvm.testing.requires_cmsisnn def test_multiple_functions(): """Tests the pass ExternConstants when global function diff --git a/tests/python/contrib/test_cmsisnn/test_scalar_to_tensor_constant.py b/tests/python/contrib/test_cmsisnn/test_scalar_to_tensor_constant.py index 557a65aeffcaf..df54f7ce55f15 100644 --- a/tests/python/contrib/test_cmsisnn/test_scalar_to_tensor_constant.py +++ b/tests/python/contrib/test_cmsisnn/test_scalar_to_tensor_constant.py @@ -256,6 +256,47 @@ def test_all_primary_operands_tensor_constants(): assert tvm.ir.structural_equal(mod[global_var].body, new_mod[global_var].body) +@tvm.testing.requires_cmsisnn +def test_duplicate_constant_arguments(): + """Tests the pass when repeating operands are arguments to the binary op""" + dtype = "int8" + shape = (1, 3, 3, 32) + operand0 = generate_variable("operand0", shape, dtype) + operand1 = generate_variable("operand1", shape, dtype) + binary_op = make_binary_op( + relay.qnn.op.add, + operand0, + operand0, + input_0_scale=0.0128, + input_0_zero_point=32, + input_1_scale=0.256, + input_1_zero_point=-64, + ) + + local_func = relay.Function([operand0, operand1], binary_op, relay.TensorType(shape, dtype)) + local_func = set_composite_func_attr(local_func, "cmsis-nn.qnn_add") + + rng = np.random.default_rng(12345) + arg0 = relay.const(rng.integers(-128, high=127, size=shape, dtype=dtype)) + call_local_func = relay.Call(local_func, [arg0, arg0]) + extern_func = relay.Function([], call_local_func, relay.TensorType(shape, dtype)) + + global_var = relay.GlobalVar("external_function") + extern_func = set_external_func_attr(extern_func, "cmsis-nn", global_var.name_hint) + call_extern_func = relay.Call(global_var, []) + main_func = relay.Function([], call_extern_func, relay.TensorType(shape, dtype)) + main_var = relay.GlobalVar("main") + + mod = tvm.IRModule() + mod[global_var] = extern_func + mod[main_var] = main_func + + mod = relay.transform.InferType()(mod) + mod = ScalarToTensorConstants()(mod) + new_mod = relay.transform.InferType()(mod) + assert tvm.ir.structural_equal(mod[global_var].body, new_mod[global_var].body) + + @tvm.testing.requires_cmsisnn def test_non_cmsisnn_ext_func(): """Non CMSISNN functions should not be altered.""" diff --git a/tests/python/contrib/test_dnnl.py b/tests/python/contrib/test_dnnl.py index c884665421cbf..2138eda086978 100755 --- 
a/tests/python/contrib/test_dnnl.py +++ b/tests/python/contrib/test_dnnl.py @@ -20,6 +20,7 @@ import sys import subprocess import math +import collections import tvm from tvm import relay @@ -51,7 +52,7 @@ def bf16_supported(): cpu_info = subprocess.check_output("sysctl -a", shell=True).strip().decode() for line in cpu_info.split("\n"): if line.startswith("hw.optional.avx512f"): - _bf16_supported = bool(line.split(":", 1)[1]) + _bf16_supported = bool(int(line.split(":", 1)[1])) elif sys.platform.startswith("linux"): _bf16_supported = "avx512" in open("/proc/cpuinfo", "r").read() return _bf16_supported @@ -114,6 +115,7 @@ def partition_for_dnnl(mod, params=None, alter_layout=True, prune_subgraphs=True mod = dnnl.rewrite_layer_norm(mod) mod = dnnl.rewrite_dense_bias_gelu_reshape_last(mod) + mod = dnnl.legalize_qnn_for_dnnl(mod) byoc_seq = tvm.transform.Sequential( [ @@ -1126,5 +1128,540 @@ def get_graph(act=None): ) +def permute_shape(shape, l_from="", l_to=""): + res_shape = [] + for label in l_to: + pos = l_from.find(label) + res_shape.append(shape[pos]) + + return res_shape + + +def expand_dim(shape, rank=0): + assert len(shape) == 1 + return shape + [1] * (rank - 1) + + +def filler_uni(low=0, high=1): + def filler_func(shape): + return np.random.uniform(low, high, shape) + + return filler_func + + +class QnnBuilder: + def __init__(self, qnn_profile=None): + self._args = {} + self._args_op = [] + self._qp = qnn_profile + + def arg(self, shape=[], dtype="float32", filler=filler_uni(), is_const=True): + if isinstance(filler, (int, float)): + value = np.full(shape, filler).astype(dtype) + else: + value = filler(shape).astype(dtype) + + if is_const: + res = relay.const(value, dtype=dtype) + else: + name = f"in_{len(self._args)}" + res = relay.var(name, shape=shape, dtype=dtype) + self._args[name] = value + self._args_op.append(res) + + return res + + def make_zp(self, mean_val, num_ch=1, dispersion=0.2): + if num_ch == 1: + return self.arg(shape=[], dtype="int32", filler=mean_val) + else: + low = int(mean_val * (1 - dispersion)) + high = int(mean_val * (1 + dispersion)) + return self.arg(shape=[num_ch], dtype="int32", filler=filler_uni(low, high)) + + def make_scl(self, mean_val, num_ch=1, dispersion=0.2): + if num_ch == 1: + return self.arg(shape=[], dtype="float32", filler=mean_val) + else: + low = mean_val * (1 - dispersion) + high = mean_val * (1 + dispersion) + return self.arg(shape=[num_ch], dtype="float32", filler=filler_uni(low, high)) + + def make_zp_and_scl(self, name, num_ch=1, dispersion=0.2): + is_per_channel = getattr(self._qp, f"{name}_pc") + zp_val = getattr(self._qp, f"{name}_zp") + scl_val = getattr(self._qp, f"{name}_scl") + + zp = self.make_zp(zp_val, num_ch if is_per_channel else 1, dispersion) + scl = self.make_scl(scl_val, num_ch if is_per_channel else 1, dispersion) + return zp, scl + + def finalize(self, op): + func = relay.Function(self._args_op, op) + mod = tvm.IRModule.from_expr(func) + mod = relay.transform.InferType()(mod) + return mod, self._args + + +def check_fully_annotated(mod, desired_compiler): + matched_ops = [] + other_ops = [] + + def _visit(node): + if isinstance(node, tvm.relay.Call): + op = node.op + if isinstance(op, relay.GlobalVar): + func = mod[op] + if "Compiler" in func.attrs and func.attrs["Compiler"] == desired_compiler: + matched_ops.append(op) + return + else: + other_ops.append(op) + + tvm.relay.analysis.post_order_visit(mod["main"].body, _visit) + + assert len(other_ops) == 0 and len(matched_ops) != 0, "Model is not fully DNNL compiled" + 
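+# check_result (below) verifies the partitioned module against the reference
+# module on two executor paths, the VM and the graph executor, so a regression
+# in either path fails the test.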
+ +def check_result( + mod, + ref_mod, + map_inputs, + tol=1e-5, + target="llvm", + device=tvm.cpu(), + params=None, + ref_result=None, + atol=None, + desired_compiler="dnnl", +): + if atol is None: + atol = tol + + if desired_compiler is not None: + check_fully_annotated(mod, desired_compiler) + + if ref_result is None: + # Run the reference result + relay.backend.te_compiler.get().clear() + with tvm.transform.PassContext(opt_level=3): + ref_lib = relay.build(ref_mod, target=target, params=params) + ref_rt_mod = tvm.contrib.graph_executor.GraphModule(ref_lib["default"](device)) + + for name, data in map_inputs.items(): + ref_rt_mod.set_input(name, data) + ref_rt_mod.run() + out = ref_rt_mod.get_output(0) + ref_result = out.numpy() + + def check_vm_result(): + relay.backend.te_compiler.get().clear() + with tvm.transform.PassContext(opt_level=3): + exe = relay.vm.compile(mod, target=target, params=params) + code, lib = exe.save() + exe = tvm.runtime.vm.Executable.load_exec(code, lib) + vm = tvm.runtime.vm.VirtualMachine(exe, device) + output = vm.run(**map_inputs) + tvm.testing.assert_allclose(output.numpy(), ref_result, rtol=tol, atol=atol) + + def check_graph_executor_result(): + relay.backend.te_compiler.get().clear() + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target=target, params=params) + rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](device)) + + rt_mod.run(**map_inputs) + output = rt_mod.get_output(0) + tvm.testing.assert_allclose(output.numpy(), ref_result, rtol=tol, atol=atol) + + check_vm_result() + check_graph_executor_result() + + +ConvProfile = collections.namedtuple( + "ConvProfile", + [ + "SHAPE", + "KER", + "STR", + "PAD", + "DEL", + "OC", + "GR", + "D_LAYOUT", + "K_LAYOUT", + ], +) +base_conv = ConvProfile( + SHAPE=[1, 8, 5, 5], + KER=[3, 3], + STR=[1, 1], + PAD=[1, 1], + DEL=[1, 1], + OC=16, + GR=1, + D_LAYOUT="NCHW", + K_LAYOUT="OIHW", +) +base_conv_nhwc = base_conv._replace(D_LAYOUT="NHWC", K_LAYOUT="HWIO") +base_conv_dilated = base_conv._replace(PAD=[2, 2], DEL=[2, 2]) +base_conv_no_pad = base_conv._replace(PAD=[0, 0]) +base_conv_no_pad_nhwc = base_conv_no_pad._replace(D_LAYOUT="NHWC", K_LAYOUT="HWIO") +base_conv_group_no_pad = base_conv_no_pad._replace(GR=2) +base_conv_dw_no_pad = base_conv_no_pad._replace(SHAPE=[1, 16, 5, 5], GR=16) + + +DenseProfile = collections.namedtuple("DenseProfile", ["N", "IC", "OC"]) +base_dense_profile = DenseProfile(N=2, IC=10, OC=16) + +ArgConstConfig = collections.namedtuple("ArgConstConfig", ["Data", "Weights", "Bias", "Sum"]) +acp_regular = ArgConstConfig(Data=False, Weights=True, Bias=True, Sum=None) +acp_no_bias = ArgConstConfig(Data=False, Weights=True, Bias=None, Sum=None) +acp_with_sum = ArgConstConfig(Data=False, Weights=True, Bias=True, Sum=False) +acp_no_bias_with_sum = ArgConstConfig(Data=False, Weights=True, Bias=None, Sum=False) + +QuantizationConfig = collections.namedtuple( + "QuantizationConfig", + [ + "d_zp", + "d_scl", + "d_pc", + "k_zp", + "k_scl", + "k_pc", + "rq_zp", + "rq_scl", + "rq_pc", + "sum_zp", + "sum_scl", + "sum_pc", + "o_zp", + "o_scl", + "o_pc", + ], +) + +qp_regular = QuantizationConfig( + d_zp=0, + d_scl=0.2, + d_pc=False, + k_zp=0, + k_scl=0.1, + k_pc=False, + rq_zp=30, + rq_scl=0.2, + rq_pc=False, + sum_zp=15, + sum_scl=0.3, + sum_pc=False, + o_zp=5, + o_scl=0.2, + o_pc=False, +) +qp_asymmetric_data = qp_regular._replace( + d_zp=3, rq_zp=10, rq_scl=0.1, sum_zp=15, sum_scl=0.3, o_zp=4 +) + +qnn_conv_profiles = tvm.testing.parameter( + by_dict={ + # Pattern 
qnn.conv2d + qnn.requantize + "Base": (base_conv, acp_regular, qp_regular), + "NHWC": (base_conv_nhwc, acp_regular, qp_regular), + # Asymmetric input. NOTE: No pad! Input ZP is not compatible with padding + "Group": (base_conv_group_no_pad, acp_regular, qp_asymmetric_data), + "DW": (base_conv_dw_no_pad, acp_regular, qp_asymmetric_data), + "NoBias": (base_conv, acp_no_bias, qp_regular), + "AsymmetricInput": (base_conv_no_pad, acp_regular, qp_asymmetric_data), + "AsymmetricInput_NHWC": (base_conv_no_pad_nhwc, acp_regular, qp_asymmetric_data), + # Pattern Conv2d + Requantize + Sum + "WithSum": (base_conv_no_pad, acp_with_sum, qp_asymmetric_data), + "WithSum_NHWC": (base_conv_no_pad_nhwc, acp_with_sum, qp_asymmetric_data), + "WithSum_NoBias": (base_conv_no_pad, acp_no_bias_with_sum, qp_asymmetric_data), + } +) + + +@has_dnnl_codegen +def test_qnn_conv2d(qnn_conv_profiles): + def generate_model(p, c, q): + np.random.seed(0) + + N, IC, IH, IW = p.SHAPE + d_shape = p.SHAPE + w_shape = [p.OC, IC, *p.KER] + b_shape = [p.OC] + s_shape = [ + p.SHAPE[0], + p.OC, + (IH + 2 * p.PAD[0] - (p.KER[0] - 1) * p.DEL[0] - 1) // p.STR[0] + 1, + (IW + 2 * p.PAD[1] - (p.KER[1] - 1) * p.DEL[1] - 1) // p.STR[1] + 1, + ] + + if p.GR != 1: + w_shape[1] //= p.GR + + d_shape = permute_shape(d_shape, l_from="NCHW", l_to=p.D_LAYOUT) + s_shape = permute_shape(s_shape, l_from="NCHW", l_to=p.D_LAYOUT) + w_shape = permute_shape(w_shape, l_from="OIHW", l_to=p.K_LAYOUT) + + c_dim = p.D_LAYOUT.find("C") + b_shape = expand_dim(b_shape, rank=len(p.D_LAYOUT) - c_dim) + + bld = QnnBuilder(qnn_profile=q) + + # Start build a test graph + data = bld.arg(shape=d_shape, dtype="uint8", is_const=c.Data, filler=filler_uni(0, 20)) + d_zp, d_scl = bld.make_zp_and_scl("d", IC) + + # Convolution + wgh = bld.arg(shape=w_shape, dtype="int8", is_const=c.Weights, filler=filler_uni(-20, 20)) + w_zp, w_scl = bld.make_zp_and_scl("k") + + op = tvm.relay.qnn.op.conv2d( + data, + wgh, + d_zp, + w_zp, + d_scl, + w_scl, + kernel_size=p.KER, + padding=p.PAD, + strides=p.STR, + dilation=p.DEL, + groups=p.GR, + channels=p.OC, + out_dtype="int32", + data_layout=p.D_LAYOUT, + kernel_layout=p.K_LAYOUT, + ) + # Optional bias + if c.Bias is not None: + bias = bld.arg( + shape=b_shape, dtype="int32", is_const=c.Bias, filler=filler_uni(-50, 50) + ) + op = tvm.relay.add(op, bias) + + # Re-quantization + rq_in_zp = bld.make_zp(0) + rq_in_scl = bld.make_scl(q.d_scl * q.k_scl) # in real cases that should be a vector + rq_out_zp, rq_out_scl = bld.make_zp_and_scl("rq") + + op = tvm.relay.qnn.op.requantize( + op, rq_in_scl, rq_in_zp, rq_out_scl, rq_out_zp, out_dtype="int32" + ) + op = tvm.relay.clip( + op, a_min=0.0, a_max=255.0 + ) # pytorch frontend specific, I guess it's redundant + op = tvm.relay.cast(op, dtype="uint8") + + # Optional sum (ResNet like) + if c.Sum is not None: + sum_in = bld.arg(dtype="uint8", shape=s_shape, filler=filler_uni(0, 10), is_const=c.Sum) + + lhs_zp, lhs_scl = bld.make_zp_and_scl("rq") + rhs_zp, rhs_scl = bld.make_zp_and_scl("sum") + out_zp, out_scl = bld.make_zp_and_scl("o") + + op = tvm.relay.qnn.op.add(op, sum_in, lhs_scl, lhs_zp, rhs_scl, rhs_zp, out_scl, out_zp) + op = tvm.relay.clip(op, a_min=0.0, a_max=255.0) + + return bld.finalize(op) + + conv_p, arg_p, quant_p = qnn_conv_profiles + ref_mod, args = generate_model(conv_p, arg_p, quant_p) + mod = partition_for_dnnl(ref_mod) + + # atol=1 means int values should match with +-1 quantum value tolerance + check_result(mod, ref_mod, args, tol=1e-10, atol=1, desired_compiler="dnnl") + + 
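+# Note: the spatial dims of s_shape in generate_model above follow the standard
+# convolution output-size formula out = (in + 2 * pad - (kernel - 1) * dilation - 1) // stride + 1,
+# e.g. for base_conv: (5 + 2 * 1 - (3 - 1) * 1 - 1) // 1 + 1 == 5.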
+conv_profiles = tvm.testing.parameter( + by_dict={ + "Base": (base_conv, acp_regular), + "NHWC": (base_conv_nhwc, acp_regular), + "Group": (base_conv_group_no_pad, acp_regular), + "DW": (base_conv_dw_no_pad, acp_regular), + "Dilated": (base_conv_dilated, acp_regular), + } +) + + +@has_dnnl_codegen +def test_conv2d_plus(conv_profiles): + def generate_model(p, c): + np.random.seed(0) + + N, IC, IH, IW = p.SHAPE + d_shape = p.SHAPE + w_shape = [p.OC, IC, *p.KER] + b_shape = [p.OC] + s_shape = [ + p.SHAPE[0], + p.OC, + (IH + 2 * p.PAD[0] - (p.KER[0] - 1) * p.DEL[0] - 1) // p.STR[0] + 1, + (IW + 2 * p.PAD[1] - (p.KER[1] - 1) * p.DEL[1] - 1) // p.STR[1] + 1, + ] + + if p.GR != 1: + w_shape[1] //= p.GR + + d_shape = permute_shape(d_shape, l_from="NCHW", l_to=p.D_LAYOUT) + s_shape = permute_shape(s_shape, l_from="NCHW", l_to=p.D_LAYOUT) + w_shape = permute_shape(w_shape, l_from="OIHW", l_to=p.K_LAYOUT) + + c_dim = p.D_LAYOUT.find("C") + # b_shape = expand_dim(b_shape, rank=len(p.D_LAYOUT) - c_dim) + + bld = QnnBuilder() + + op = bld.arg(shape=d_shape, dtype="float32", is_const=c.Data) + wgh = bld.arg(shape=w_shape, dtype="float32", is_const=c.Weights) + op = tvm.relay.nn.conv2d( + op, + wgh, + kernel_size=p.KER, + padding=p.PAD, + strides=p.STR, + dilation=p.DEL, + groups=p.GR, + channels=p.OC, + out_dtype="float32", + data_layout=p.D_LAYOUT, + kernel_layout=p.K_LAYOUT, + ) + + if c.Bias is not None: + bias = bld.arg(shape=b_shape, dtype="float32", is_const=c.Bias) + op = tvm.relay.nn.bias_add(op, bias, axis=c_dim) + + if c.Sum is not None: + sum_in = bld.arg(shape=s_shape, dtype="float32", is_const=c.Sum) + op = tvm.relay.op.add(op, sum_in) + + return bld.finalize(op) + + conv_p, arg_p = conv_profiles + ref_mod, args = generate_model(conv_p, arg_p) + mod = partition_for_dnnl(ref_mod, alter_layout=False) + check_result(mod, ref_mod, args, tol=1e-5, desired_compiler="dnnl") + + +qnn_dense_profiles = tvm.testing.parameter( + by_dict={ + # Pattern Dense + Requantize + "Base": (base_dense_profile, acp_regular, qp_regular), + "AsymmetricInput": (base_dense_profile, acp_regular, qp_asymmetric_data), + # Pattern Dense + Requantize + Sum + "AsymmetricInput_Sum": (base_dense_profile, acp_with_sum, qp_asymmetric_data), + } +) + + +@has_dnnl_codegen +def test_qnn_dense(qnn_dense_profiles): + def generate_model(p, c, q): + np.random.seed(0) + + d_shape = [p.N, p.IC] + w_shape = [p.OC, p.IC] + b_shape = [p.OC] + s_shape = [p.N, p.OC] + + bld = QnnBuilder(qnn_profile=q) + + # Start build a test graph + data = bld.arg(shape=d_shape, dtype="uint8", is_const=c.Data, filler=filler_uni(0, 20)) + d_zp, d_scl = bld.make_zp_and_scl("d", p.IC) + + # Convolution + wgh = bld.arg(shape=w_shape, dtype="int8", is_const=c.Weights, filler=filler_uni(-20, 20)) + w_zp, w_scl = bld.make_zp_and_scl("k") + + op = tvm.relay.qnn.op.dense( + data, wgh, d_zp, w_zp, d_scl, w_scl, units=p.OC, out_dtype="int32" + ) + # Optional bias + if c.Bias is not None: + bias = bld.arg( + shape=b_shape, dtype="int32", is_const=c.Bias, filler=filler_uni(-50, 50) + ) + op = tvm.relay.add(op, bias) + + # Re-quantization + rq_in_zp = bld.make_zp(0) + rq_in_scl = bld.make_scl(q.d_scl * q.k_scl) # in real cases that should be a vector + rq_out_zp, rq_out_scl = bld.make_zp_and_scl("rq") + + op = tvm.relay.qnn.op.requantize( + op, rq_in_scl, rq_in_zp, rq_out_scl, rq_out_zp, out_dtype="int32" + ) + op = tvm.relay.clip( + op, a_min=0.0, a_max=255.0 + ) # pytorch frontend specific, I guess it's redundant + op = tvm.relay.cast(op, dtype="uint8") + + # Optional 
sum (ResNet like) + if c.Sum is not None: + sum_in = bld.arg(dtype="uint8", shape=s_shape, filler=filler_uni(0, 10), is_const=c.Sum) + + lhs_zp, lhs_scl = bld.make_zp_and_scl("rq") + rhs_zp, rhs_scl = bld.make_zp_and_scl("sum") + out_zp, out_scl = bld.make_zp_and_scl("o") + + op = tvm.relay.qnn.op.add(op, sum_in, lhs_scl, lhs_zp, rhs_scl, rhs_zp, out_scl, out_zp) + op = tvm.relay.clip(op, a_min=0.0, a_max=255.0) + + return bld.finalize(op) + + conv_p, arg_p, quant_p = qnn_dense_profiles + ref_mod, args = generate_model(conv_p, arg_p, quant_p) + mod = partition_for_dnnl(ref_mod) + + # atol=1 means int values should match with +-1 quantum value tolerance + check_result(mod, ref_mod, args, tol=1e-10, atol=1, desired_compiler="dnnl") + + +dense_profiles = tvm.testing.parameter( + by_dict={ + "Base": (base_dense_profile, acp_regular), + "WithSum": (base_dense_profile, acp_with_sum), + } +) + + +@has_dnnl_codegen +def test_dense_plus(dense_profiles): + def generate_model(p, c): + np.random.seed(0) + + d_shape = [p.N, p.IC] + w_shape = [p.OC, p.IC] + b_shape = [p.OC] + s_shape = [p.N, p.OC] + + c_dim = 1 + + bld = QnnBuilder() + + op = bld.arg(shape=d_shape, dtype="float32", is_const=c.Data) + wgh = bld.arg(shape=w_shape, dtype="float32", is_const=c.Weights) + op = tvm.relay.nn.dense(op, wgh, out_dtype="float32") + + if c.Bias is not None: + bias = bld.arg(shape=b_shape, dtype="float32", is_const=c.Bias) + op = tvm.relay.nn.bias_add(op, bias, axis=c_dim) + + if c.Sum is not None: + sum_in = bld.arg(shape=s_shape, dtype="float32", is_const=c.Sum) + op = tvm.relay.op.add(op, sum_in) + + return bld.finalize(op) + + dense_p, arg_p = dense_profiles + ref_mod, args = generate_model(dense_p, arg_p) + mod = partition_for_dnnl(ref_mod) + check_result(mod, ref_mod, args, tol=1e-5, desired_compiler="dnnl") + + if __name__ == "__main__": tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/benchmark_util.py b/tests/python/contrib/test_hexagon/benchmark_util.py index 113c7780c130f..e581c3d55d216 100644 --- a/tests/python/contrib/test_hexagon/benchmark_util.py +++ b/tests/python/contrib/test_hexagon/benchmark_util.py @@ -16,6 +16,57 @@ # under the License. import csv +import os +import pytest +import tempfile +import collections + + +def skip_bencharks_flag_and_reason(): + """ + Returns one of these tuples: + (False, '') or + (True, (a string describing why the test should be skipped)) + + NOTE: This function is a temporary measure to prevent the TVM CI system + from running benchmark scripts every time the CI pre-commit hook executes. + This should go away when a better system is in place to govern when various + tests / benchmarks are executed. + """ + asn = os.environ.get("ANDROID_SERIAL_NUMBER") + + if asn == "simulator": + return (True, "Skipping benchmarks when ANDROID_SERIAL_NUMBER='simulator'") + else: + return (False, "") + + +class UnsupportedException(Exception): + """ + Indicates that the specified benchmarking configuration is known to + currently be unsupported. The Exception message may provide more detail. + """ + + +class NumericalAccuracyException(Exception): + """ + Indicates that the benchmarking configuration appeared to run successfully, + but the output data didn't have the expected accuracy. + """
+
+
+class UnsupportedException(Exception):
+    """
+    Indicates that the specified benchmarking configuration is known to
+    currently be unsupported. The Exception message may provide more detail.
+    """
+
+
+class NumericalAccuracyException(Exception):
+    """
+    Indicates that the benchmarking configuration appeared to run successfully,
+    but the output data didn't have the expected accuracy.
+    """


 class BenchmarksTable:
@@ -173,3 +224,62 @@ def get_benchmark_decription(keys_dict):
     other characters that make it unsuitable for use as a filename.
     """
     return " ".join([f"{k}={v}" for k, v in keys_dict.items()])
+
+
+# This fixture provides some initialization / finalization logic for groups of related
+# benchmark runs.
+# See the fixture implementation below for details.
+#
+# The fixture's mechanics are described here: https://stackoverflow.com/a/63047695
+#
+# TODO: There may be cleaner ways to let each class that uses this fixture provide its
+# own value for `csv_column_order`.
+#
+# TODO: In the future we may wish to break this fixture up into several smaller ones.
+#
+# The overall contract for a class (e.g. `MyTest`) using this fixture is as follows:
+#
+#    @pytest.mark.usefixtures("benchmark_group")
+#    class MyTest:
+#
+#        # The fixture requires that this class variable is defined before
+#        # the fixture's finalizer-logic executes.
+#        #
+#        # This is used as an argument to BenchmarksTable.print_csv(...) after
+#        # all of MyTest's unit tests have executed.
+#        csv_column_order = [
+#            ...
+#        ]
+#
+# Before MyTest's first unit test executes, the fixture will populate the
+# following class variables:
+#    MyTest.working_dir : str
+#    MyTest.benchmark_table : BenchmarksTable
+@pytest.fixture(scope="class")
+def benchmark_group(request):
+    working_dir = tempfile.mkdtemp()
+    bt = BenchmarksTable()
+
+    request.cls.working_dir = working_dir
+    request.cls.benchmark_table = bt
+
+    yield
+
+    tabular_output_filename = os.path.join(working_dir, "benchmark-results.csv")
+
+    if not hasattr(request.cls, "csv_column_order"):
+        raise Exception('Classes using this fixture must have a member named "csv_column_order"')
+
+    with open(tabular_output_filename, "w") as csv_file:
+        bt.print_csv(csv_file, request.cls.csv_column_order)
+
+    print()
+    print("*" * 80)
+    print(f"BENCHMARK RESULTS FILE: {tabular_output_filename}")
+    print("*" * 80)
+    print()
+
+    if bt.has_fail() > 0:
+        pytest.fail("At least one benchmark configuration failed", pytrace=False)
diff --git a/tests/python/contrib/test_hexagon/benchmark_elemwise_add.py b/tests/python/contrib/test_hexagon/test_benchmark_elemwise_add.py
similarity index 93%
rename from tests/python/contrib/test_hexagon/benchmark_elemwise_add.py
rename to tests/python/contrib/test_hexagon/test_benchmark_elemwise_add.py
index 70266d7939bc5..f7f5f3e176e46 100644
--- a/tests/python/contrib/test_hexagon/benchmark_elemwise_add.py
+++ b/tests/python/contrib/test_hexagon/test_benchmark_elemwise_add.py
@@ -28,7 +28,9 @@
 from tvm.script import tir as T
 from tvm import te
 from tvm.contrib.hexagon.build import HexagonLauncherRPC
-from . import benchmark_util
+from . import benchmark_util as bu
+
+_SHOULD_SKIP_BENCHMARKS, _SKIP_BENCHMARKS_REASON = bu.skip_bencharks_flag_and_reason()

 # This is a fixed detail of the v68 architecture.
 HVX_VECTOR_BYTES = 128
@@ -43,7 +45,7 @@
 # triggering TIME_WAIT state on the server socket. This prevents another
 # server to bind to the same port until the wait time elapses.
-_BT = benchmark_util.BenchmarksTable() +_BT = bu.BenchmarksTable() _CSV_COLUMN_ORDER = [ # Identifies which TE-compute / TIRScript is used as the basis for the @@ -88,21 +90,6 @@ print("-" * 80) print() - -class UnsupportedException(Exception): - """ - Indicates that the specified benchmarking configuration is known to - currently be unsupported. The Exception message may provide more detail. - """ - - -class NumericalAccuracyException(Exception): - """ - Indicates that the benchmarking configuration appeared to run successfully, - but the output data didn't have the expected accuracy. - """ - - from typing import Tuple @@ -129,7 +116,7 @@ def _get_irmod_elemwise_add( dtype_str = str(dtype) if mem_scope == "global.vtcm": - raise UnsupportedException("This benchmark kernel does not yet support VTCM buffers.") + raise bu.UnsupportedException("This benchmark kernel does not yet support VTCM buffers.") # This check is currently elided by the one above, but it should become relevant as soon # as we add VTCM support to this kernel generator. @@ -147,7 +134,7 @@ def _get_irmod_elemwise_add( estimated_vtcm_needed_bytes = shape[0] * shape[1] * dtype_bytes * num_vtcm_tensors if estimated_vtcm_needed_bytes > estimated_vtcm_budget_bytes: - raise UnsupportedException("Expect to exceed VTCM budget.") + raise bu.UnsupportedException("Expect to exceed VTCM budget.") @tvm.script.ir_module class BenchmarkModule: @@ -190,10 +177,10 @@ def _benchmark_hexagon_elementwise_add_kernel( "mem_scope": mem_scope, } - desc = benchmark_util.get_benchmark_decription(keys_dict) + desc = bu.get_benchmark_decription(keys_dict) # Create the host-side directory for this benchmark run's files / logs... - host_files_dir_name = benchmark_util.get_benchmark_id(keys_dict) + host_files_dir_name = bu.get_benchmark_id(keys_dict) host_files_dir_path = os.path.join(_HOST_OUTPUT_DIR, host_files_dir_name) os.mkdir(host_files_dir_path) @@ -238,7 +225,9 @@ def _benchmark_hexagon_elementwise_add_kernel( # Upload the .so to the Android device's file system (or wherever is appropriate # when using the Hexagon simulator)... target_dso_binary_filename = "test_binary.so" - hexagon_launcher.upload(host_dso_binary_path, target_dso_binary_filename) + target_dso_binary_pathname = hexagon_launcher.upload( + host_dso_binary_path, target_dso_binary_filename + ) # Generate our testing / validation data... ( @@ -251,7 +240,7 @@ def _benchmark_hexagon_elementwise_add_kernel( # On the target device / simulator, make our Hexagon-native shared object # available for use... loaded_hexagon_module: tvm.runtime.module.Module = hexagon_launcher.load_module( - target_dso_binary_filename, sess + target_dso_binary_pathname, sess ) # Create the target-side tensors to hold the primfunc's inputs and outputs... @@ -296,11 +285,11 @@ def _benchmark_hexagon_elementwise_add_kernel( result, host_numpy_C_data_expected, rel_tolerance, abs_tolerance ) except AssertionError as e: - raise NumericalAccuracyException(str(e)) + raise bu.NumericalAccuracyException(str(e)) _BT.record_success(timing_result, **keys_dict) - except NumericalAccuracyException as e: + except bu.NumericalAccuracyException as e: print() print(f"FAIL: Numerical accuracy error. See log file.") @@ -309,7 +298,7 @@ def _benchmark_hexagon_elementwise_add_kernel( _BT.record_fail(**keys_dict, comments=f"Numerical accuracy error. 
See log file.")

-    except UnsupportedException as e:
+    except bu.UnsupportedException as e:
         print()
         print(f"SKIP: {e}")
@@ -381,6 +370,7 @@ def _get_elemwise_add_reference_value_tensors(shape: list, dtype: str):
     ]


+@pytest.mark.skipif(_SHOULD_SKIP_BENCHMARKS, reason=_SKIP_BENCHMARKS_REASON)
 @tvm.testing.requires_hexagon
 def test_elemwise_add(hexagon_launcher: HexagonLauncherRPC):
     for dtype in [
@@ -432,3 +422,7 @@ def test_elemwise_add(hexagon_launcher: HexagonLauncherRPC):

     if _BT.has_fail() > 0:
         pytest.fail("At least one benchmark configuration failed", pytrace=False)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py b/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py
new file mode 100644
index 0000000000000..41169494417a2
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py
@@ -0,0 +1,351 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+This module serves two purposes:
+    (1) Demonstrates how to write Python code that exercises various
+        Hexagon-related algorithms / features.
+
+    (2) Benchmarks the resulting primfuncs.
+
+Current limitations:
+    - Input shapes are limited to NHWC --> NHWC_8h8w32c.
+
+    - Testing parameters (input shapes, dtypes, etc.) currently
+      support only one value for each parameter.
+
+    - H, W, C must be integer multiples of 8, 8, and 32,
+      respectively. I.e., partial blocks aren't currently
+      supported by this script.
+
+    - Requires that I/O tensors reside in "global.vtcm" memory,
+      rather than "global" memory.
+      This prevents benchmarking with I/O tensors that are too
+      large to fit into available VTCM.
+
+    - The script currently implements only one primfunc.
+      Future revisions are expected to add more primfuncs and
+      demonstrate more coding strategies.
+"""
+
+import sys
+import pytest
+import numpy as np
+import copy
+import os
+
+import tvm.testing
+from tvm import te, topi, tir
+from tvm.topi import testing
+from tvm.script import tir as T
+from tvm.tir import IndexMap
+from tvm.relay.backend import Executor, Runtime
+from tvm.contrib.hexagon.session import Session
+from typing import List
+
+from .infrastructure import allocate_hexagon_array
+from . import benchmark_util as bu
+from .benchmark_util import benchmark_group
+
+_SHOULD_SKIP_BENCHMARKS, _SKIP_BENCHMARKS_REASON = bu.skip_bencharks_flag_and_reason()
+
+
+def _ceil_div(numerator, denominator):
+    return (numerator + (denominator - 1)) // denominator
+
+
+def _int8_nhwc_8h8w32c_map(n, h, w, c):
+    return [
+        n,
+        h // 8,
+        w // 8,
+        c // 32,
+        te.AXIS_SEPARATOR,
+        h % 8,
+        w % 8,
+        c % 32,
+    ]
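+
+
+# For example, logical index (n, h, w, c) = (0, 10, 3, 40) maps to physical index
+# (n, h//8, w//8, c//32, h%8, w%8, c%32) = (0, 1, 0, 1, 2, 3, 8): element (10, 3, 40)
+# lands in block (1, 0, 1) at intra-block offset (2, 3, 8). te.AXIS_SEPARATOR marks
+# where the block coordinates end and the intra-block coordinates begin.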
+
+
+def _int8_nhwc_8h8w32c_shape(n, h, w, c) -> List[int]:
+    return [
+        n,
+        _ceil_div(h, 8),
+        _ceil_div(w, 8),
+        _ceil_div(c, 32),
+        8,
+        8,
+        32,
+    ]
+
+
+def _int8_nhwc_8h8w32c_xform_immediate(arr_in: np.ndarray) -> np.ndarray:
+    """
+    Return a deep copy of 'arr_in', transformed from an NHWC to an
+    NHWC-8h8w32c shape. Any newly created array elements have value 0.
+    """
+    stage1 = copy.copy(arr_in)
+
+    (
+        n,
+        h,
+        w,
+        c,
+    ) = stage1.shape
+
+    (
+        h_minor,
+        w_minor,
+        c_minor,
+    ) = [8, 8, 32]
+
+    h_major = _ceil_div(h, h_minor)
+    w_major = _ceil_div(w, w_minor)
+    c_major = _ceil_div(c, c_minor)
+
+    # This handles cases where the dimensions of arr_in are not cleanly divided
+    # by the minor block size, i.e. [8, 8, 32].
+    #
+    # Any additional array elements that this creates will have value 0.
+    # We shouldn't actually care what value is used for those elements, because they
+    # shouldn't be treated as meaningful by any of our algorithms.
+    if (h % h_minor) or (w % w_minor) or (c % c_minor):
+        stage1.resize((n, h_major * h_minor, w_major * w_minor, c_major * c_minor), refcheck=False)
+
+    stage2 = stage1.reshape(n, h_major, h_minor, w_major, w_minor, c_major, c_minor)
+    stage3 = stage2.transpose(0, 1, 3, 5, 2, 4, 6)
+    return stage3
+
+
+def _create_test_input(shape, dtype: str) -> np.ndarray:
+    np_dtype = np.dtype(dtype)
+    min_value = np.iinfo(np_dtype).min
+    max_value = np.iinfo(np_dtype).max
+    return np.random.randint(low=min_value, high=max_value, size=tuple(shape), dtype=np_dtype)
+
+
+@pytest.mark.usefixtures("benchmark_group")
+class TestMaxPool2D:
+    csv_column_order = [
+        # Identifies which TE-compute / TIRScript is used as the basis for the
+        # benchmarked primfunc. Only needs to be meaningful to humans.
+        "basic_kernel",
+        # When applicable, indicates the particular variation of schedules
+        # applied by the Python code. Decoding this may require looking at this
+        # script's source code.
+        "sched_type",
+        # Values directly based on test parameters...
+        "input_shape_4d",
+        "block_shape",
+        "DTYPE",
+        "KERNEL",
+        "STRIDE",
+        "DILATION",
+        "PADDING",
+        "IO_TENSOR_MEM_SCOPE",
+        # Reserved columns defined by the BenchmarksTable class.
+        "row_status",
+        "timings_min_usecs",
+        "timings_max_usecs",
+        "timings_median_usecs",
+        "timings_mean_usecs",
+        "timings_stddev_usecs",
+        # For benchmarks that produce files on the host file system, this indicates
+        # their location. Useful for post-mortem investigation of benchmark results.
+        "host_files_dir_path",
+        # Miscellaneous comments about the benchmark.
+        "comments",
+    ]
+
+    DTYPE = tvm.testing.parameter("int8")
+
+    # FIXME(cconvey): The script currently fails when H, W, or C is not an
+    # integer multiple of 8, 8, or 32, respectively.
+    N = tvm.testing.parameter(1)
+    H = tvm.testing.parameter(*[x * 8 for x in [1, 4, 16]])
+    W = tvm.testing.parameter(*[x * 8 for x in [1, 4, 16]])
+    C = tvm.testing.parameter(*[x * 32 for x in [1, 2]])
+
+    KERNEL = tvm.testing.parameter((1, 1), (3, 3))
+    STRIDE = tvm.testing.parameter((1, 1))
+    DILATION = tvm.testing.parameter((1, 1))
+    PADDING = tvm.testing.parameter((0, 0, 0, 0))
+    IO_TENSOR_MEM_SCOPE = tvm.testing.parameter("global.vtcm")
+
+    @pytest.mark.skipif(_SHOULD_SKIP_BENCHMARKS, reason=_SKIP_BENCHMARKS_REASON)
+    @tvm.testing.requires_hexagon
+    def test_maxpool2d_nhwc(
+        self,
+        N,
+        H,
+        W,
+        C,
+        DTYPE,
+        KERNEL,
+        STRIDE,
+        DILATION,
+        PADDING,
+        IO_TENSOR_MEM_SCOPE,
+        hexagon_session: Session,
+    ):
+        keys_dict = {
+            "basic_kernel": "max_pool2d",
+            "sched_type": 1,
+            "input_shape_4d": [N, H, W, C],
+            "block_shape": [8, 8, 32],
+            "DTYPE": DTYPE,
+            "KERNEL": KERNEL,
+            "STRIDE": STRIDE,
+            "DILATION": DILATION,
+            "PADDING": PADDING,
+            "IO_TENSOR_MEM_SCOPE": IO_TENSOR_MEM_SCOPE,
+        }
+
+        desc = bu.get_benchmark_decription(keys_dict)
+
+        # Create the host-side directory for this benchmark run's files / logs...
+        host_files_dir_name = bu.get_benchmark_id(keys_dict)
+        host_files_dir_path = os.path.join(self.working_dir, host_files_dir_name)
+        os.mkdir(host_files_dir_path)
+
+        keys_dict["host_files_dir_path"] = host_files_dir_path
+
+        log_file_path = os.path.join(host_files_dir_path, "out.txt")
+        with open(log_file_path, "w") as log_file:
+            print(f"CONFIGURATION: {desc}")
+            log_file.write(f"CONFIGURATION: {desc}\n")
+
+            try:
+                input_tensor_shape_4d = [N, H, W, C]
+                input_tensor_shape_7d = _int8_nhwc_8h8w32c_shape(N, H, W, C)
+
+                data = te.placeholder(tuple(input_tensor_shape_4d), dtype=DTYPE)
+
+                output = topi.nn.pool2d(
+                    data, KERNEL, STRIDE, DILATION, PADDING, "max", layout="NHWC"
+                )
+                primfunc = te.create_prim_func([data, output])
+
+                sch = tir.Schedule(primfunc, debug_mask="all")
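+
+                # `block="tensor"` and `buffer="placeholder"` refer to the names that
+                # te.create_prim_func assigned to the pool2d compute block and its input
+                # buffer; transform_layout rewrites that buffer into the blocked
+                # NHWC-8h8w32c layout described by _int8_nhwc_8h8w32c_map.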
+                sch.transform_layout(
+                    block="tensor", buffer="placeholder", index_map=_int8_nhwc_8h8w32c_map
+                )
+
+                target_hexagon = tvm.target.hexagon("v69", link_params=True)
+                built_module = tvm.build(
+                    sch.mod, target=tvm.target.Target(target_hexagon, host=target_hexagon)
+                )
+
+                # Save a local copy of the Hexagon object code (in the form of a .so file)
+                # to allow post-mortem inspection.
+                host_dso_binary_path = os.path.join(host_files_dir_path, "test_binary.so")
+                built_module.save(host_dso_binary_path)
+                print(f"SAVED BINARY TO HOST PATH: {host_dso_binary_path}")
+
+                hexagon_mod = hexagon_session.load_module(built_module)
+
+                # Generate the input tensor's data.
+                # Note that we'll eventually need it in two different layouts:
+                # (1) NHWC as an argument to testing.poolnd_python.
+                # (2) NHWC_8h8w32c as an argument to our Hexagon primfunc.
+                a_numpy_4d = _create_test_input(input_tensor_shape_4d, DTYPE)
+
+                ref_output_4d = testing.poolnd_python(
+                    a_numpy_4d.astype("int32"),
+                    KERNEL,
+                    STRIDE,
+                    DILATION,
+                    PADDING[0:2],
+                    PADDING[2:],
+                    pool_type="max",
+                    dtype="int32",
+                    layout="NHWC",
+                ).astype(DTYPE)
+
+                output_tensor_shape_4d = ref_output_4d.shape
+
+                a_numpy_7d = _int8_nhwc_8h8w32c_xform_immediate(a_numpy_4d)
+
+                a_hexagon_7d = allocate_hexagon_array(
+                    hexagon_session.device,
+                    tensor_shape=input_tensor_shape_7d,
+                    axis_separators=[4],
+                    dtype=DTYPE,
+                    mem_scope=IO_TENSOR_MEM_SCOPE,
+                )
+
+                c_hexagon_4d = allocate_hexagon_array(
+                    hexagon_session.device,
+                    tensor_shape=output_tensor_shape_4d,
+                    axis_separators=[],
+                    dtype=DTYPE,
+                    mem_scope=IO_TENSOR_MEM_SCOPE,
+                )
+
+                a_hexagon_7d.copyfrom(a_numpy_7d)
+
+                if DTYPE == "int8":
+                    rel_tolerance = 0
+                    abs_tolerance = 0
+                else:
+                    assert False, f"TODO: decide acceptable tolerances for DTYPE {DTYPE}"
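+
+                # number=10, repeat=1: each measurement averages 10 back-to-back runs of
+                # the kernel, and a single such measurement is reported.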
+                timer = hexagon_mod.time_evaluator(
+                    "main", hexagon_session.device, number=10, repeat=1
+                )
+                timing_result = timer(a_hexagon_7d, c_hexagon_4d)
+
+                try:
+                    tvm.testing.assert_allclose(
+                        ref_output_4d, c_hexagon_4d.numpy(), rtol=rel_tolerance, atol=abs_tolerance
+                    )
+                except AssertionError as e:
+                    raise bu.NumericalAccuracyException(str(e))
+
+            except bu.NumericalAccuracyException as e:
+                print()
+                print("FAIL: Numerical accuracy error. See log file.")
+
+                log_file.write("\n")
+                log_file.write(f"FAIL: {e}\n")
+
+                self.benchmark_table.record_fail(
+                    **keys_dict, comments="Numerical accuracy error. See log file."
+                )
+
+            except bu.UnsupportedException as e:
+                print()
+                print(f"SKIP: {e}")
+
+                log_file.write("\n")
+                log_file.write(f"SKIP: {e}\n")
+
+                self.benchmark_table.record_skip(
+                    **keys_dict, comments=f"Unsupported configuration: {e}"
+                )
+
+            else:
+                # Record success only when no exception was raised; `timing_result`
+                # is undefined when the benchmark was skipped or failed.
+                self.benchmark_table.record_success(timing_result, **keys_dict)
+
+
+if __name__ == "__main__":
+    tvm.testing.main()
diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py
index 1a4f23ad467a5..987d425aa63d4 100644
--- a/tests/python/relay/aot/test_crt_aot.py
+++ b/tests/python/relay/aot/test_crt_aot.py
@@ -44,6 +44,7 @@
     create_relay_module_and_inputs_from_tflite_file,
 )
 from tvm.micro.testing.aot_test_utils import AOT_DEFAULT_RUNNER, parametrize_aot_options
+from tvm.micro.testing.utils import get_conv2d_relay_module


 def test_error_c_interface_with_packed_api():
@@ -76,22 +77,7 @@ def test_error_c_interface_with_packed_api():
 @parametrize_aot_options
 def test_conv_with_params(interface_api, use_unpacked_api, test_runner):
     """Tests compilation of convolution with parameters"""
-    relay_model = """
-#[version = "0.0.5"]
-def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5), int8]) {
-    %1 = nn.conv2d(
-        %data,
-        %weight,
-        padding=[2, 2],
-        channels=8,
-        kernel_size=[5, 5],
-        data_layout="NCHW",
-        kernel_layout="OIHW",
-        out_dtype="int32");
-    %1
-}
-"""
-    mod = tvm.parser.fromtext(relay_model)
+    mod = get_conv2d_relay_module()
     main_func = mod["main"]
     shape_dict = {p.name_hint: p.checked_type.concrete_shape for p in main_func.params}
     type_dict = {p.name_hint: p.checked_type.dtype for p in main_func.params}
@@ -576,23 +562,7 @@ def test_multiple_models(interface_api, use_unpacked_api, test_runner):
     params1 = None

     # Convolution model
-    relay_model = """
-    #[version = "0.0.5"]
-    def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5), int8]) {
-        %1 = nn.conv2d(
-            %data,
-            %weight,
-            padding=[2, 2],
-            channels=8,
-            kernel_size=[5, 5],
-            data_layout="NCHW",
-            kernel_layout="OIHW",
-            out_dtype="int32");
-        %1
-    }
-    """
-
-    mod2 = tvm.parser.fromtext(relay_model)
+    mod2 = get_conv2d_relay_module()
     main_func = mod2["main"]
     shape_dict = {p.name_hint: p.checked_type.concrete_shape for p in main_func.params}
     type_dict = {p.name_hint: p.checked_type.dtype for p in main_func.params}
diff --git a/tests/python/relay/strategy/arm_cpu/test_avg_pool.py b/tests/python/relay/strategy/arm_cpu/test_avg_pool.py
index 31a812b38eed7..3d6690a1a16f8 100644
--- a/tests/python/relay/strategy/arm_cpu/test_avg_pool.py
+++ b/tests/python/relay/strategy/arm_cpu/test_avg_pool.py
@@ -14,9 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import sys
 import numpy as np
-import pytest
 import tvm
 import tvm.testing
 from tvm import relay
@@ -165,4 +163,4 @@ class TestAvgPool3d(BasicPoolTests):


 if __name__ == "__main__":
-    sys.exit(pytest.main([__file__] + sys.argv[1:]))
+    tvm.testing.main()
diff --git a/tests/python/relay/strategy/arm_cpu/test_conv1d_ncw.py b/tests/python/relay/strategy/arm_cpu/test_conv1d_ncw.py
index 0f0507cfe7d3d..b1dda10c42944 100644
--- a/tests/python/relay/strategy/arm_cpu/test_conv1d_ncw.py
+++ b/tests/python/relay/strategy/arm_cpu/test_conv1d_ncw.py
@@ -14,9 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import sys import numpy as np -import pytest import tvm import tvm.testing from tvm import relay @@ -114,4 +112,4 @@ class TestConv1d_ncw(BasicConv1dTests): if __name__ == "__main__": - sys.exit(pytest.main([__file__] + sys.argv[1:])) + tvm.testing.main() diff --git a/tests/python/relay/strategy/arm_cpu/test_conv1d_nwc.py b/tests/python/relay/strategy/arm_cpu/test_conv1d_nwc.py index e430ade2fac14..3daed6221f68e 100644 --- a/tests/python/relay/strategy/arm_cpu/test_conv1d_nwc.py +++ b/tests/python/relay/strategy/arm_cpu/test_conv1d_nwc.py @@ -14,9 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import sys import numpy as np -import pytest import tvm import tvm.testing from tvm import relay @@ -142,4 +140,4 @@ class TestConv1d_nwc(BasicConv1dTests): if __name__ == "__main__": - sys.exit(pytest.main([__file__] + sys.argv[1:])) + tvm.testing.main() diff --git a/tests/python/relay/strategy/arm_cpu/test_conv2d_NCHWc.py b/tests/python/relay/strategy/arm_cpu/test_conv2d_NCHWc.py index 3b43d37c9075f..8ca132ffba759 100644 --- a/tests/python/relay/strategy/arm_cpu/test_conv2d_NCHWc.py +++ b/tests/python/relay/strategy/arm_cpu/test_conv2d_NCHWc.py @@ -14,9 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import sys import numpy as np -import pytest import tvm import tvm.testing from tvm import relay @@ -135,4 +133,4 @@ class TestConv2d_NCHWc(BasicConv2dTests): if __name__ == "__main__": - sys.exit(pytest.main([__file__] + sys.argv[1:])) + tvm.testing.main() diff --git a/tests/python/relay/strategy/arm_cpu/test_dense_dsp.py b/tests/python/relay/strategy/arm_cpu/test_dense_dsp.py index 3edffba8acaa6..a69ea6c09e790 100644 --- a/tests/python/relay/strategy/arm_cpu/test_dense_dsp.py +++ b/tests/python/relay/strategy/arm_cpu/test_dense_dsp.py @@ -14,9 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import sys import numpy as np -import pytest import tvm import tvm.testing from tvm import relay @@ -87,4 +85,4 @@ class TestDense(BasicDenseTests): if __name__ == "__main__": - sys.exit(pytest.main([__file__] + sys.argv[1:])) + tvm.testing.main() diff --git a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py index 96628a6371d00..ee0d51c321f79 100644 --- a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py +++ b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d.py @@ -14,9 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import sys import numpy as np -import pytest import tvm import tvm.testing from tvm import relay diff --git a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d_NCHWc.py b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d_NCHWc.py index 69e9ab09e4c95..178b44edbd403 100644 --- a/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d_NCHWc.py +++ b/tests/python/relay/strategy/arm_cpu/test_depthwise_conv2d_NCHWc.py @@ -14,9 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-import sys import numpy as np -import pytest import tvm import tvm.testing from tvm import relay @@ -118,4 +116,4 @@ class TestDepthWiseConv2d_NCHWc(BasicConv2dTests): if __name__ == "__main__": - sys.exit(pytest.main([__file__] + sys.argv[1:])) + tvm.testing.main() diff --git a/tests/python/relay/strategy/arm_cpu/test_group_conv2d.py b/tests/python/relay/strategy/arm_cpu/test_group_conv2d.py index b24c651de988f..47fe6d9f74c25 100644 --- a/tests/python/relay/strategy/arm_cpu/test_group_conv2d.py +++ b/tests/python/relay/strategy/arm_cpu/test_group_conv2d.py @@ -14,9 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import sys import numpy as np -import pytest import tvm import tvm.testing from tvm import relay diff --git a/tests/python/relay/strategy/arm_cpu/test_max_pool.py b/tests/python/relay/strategy/arm_cpu/test_max_pool.py index f58a041ecb746..ee890261d1b42 100644 --- a/tests/python/relay/strategy/arm_cpu/test_max_pool.py +++ b/tests/python/relay/strategy/arm_cpu/test_max_pool.py @@ -14,10 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -from pickle import FALSE -import sys import numpy as np -import pytest import tvm import tvm.testing from tvm import relay @@ -132,4 +129,4 @@ class TestMaxPool3d(BasicPoolTests): if __name__ == "__main__": - sys.exit(pytest.main([__file__] + sys.argv[1:])) + tvm.testing.main() diff --git a/tests/python/relay/test_pass_partition_graph.py b/tests/python/relay/test_pass_partition_graph.py index dedeae56e9daf..58b41189a0f0c 100644 --- a/tests/python/relay/test_pass_partition_graph.py +++ b/tests/python/relay/test_pass_partition_graph.py @@ -926,11 +926,11 @@ def test_dnnl_fuse(): conv2d_relu_pat, conv2d_sigmoid_pat, ) = ( - dnnl_patterns[1], - dnnl_patterns[13], - dnnl_patterns[20], - dnnl_patterns[26], - dnnl_patterns[38], + dnnl_patterns[3], + dnnl_patterns[15], + dnnl_patterns[22], + dnnl_patterns[28], + dnnl_patterns[40], ) def get_blocks( diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py index 541f3bba13da2..06614977d4caf 100644 --- a/tests/python/relay/test_pipeline_executor.py +++ b/tests/python/relay/test_pipeline_executor.py @@ -595,6 +595,8 @@ def test_pipeline(): if input_map[0] == "0": input_data = pipeline_module_test.get_input("data_a") tvm.testing.assert_allclose(data, input_data.numpy()) + + assert pipeline_module_test.num_inputs == 2 # Running the pipeline executor in the pipeline mode. 
     pipeline_module_test.run()
diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py
index afa745760895c..80c2fbaeb416c 100644
--- a/tests/python/unittest/test_link_params.py
+++ b/tests/python/unittest/test_link_params.py
@@ -264,7 +264,8 @@ def test_c_link_params(linkable_dtype):
     c_dtype = _get_c_datatype(linkable_dtype)
     src_lines = src.split("\n")
     param = param_init[f"{linkable_dtype}_a"].reshape(np.prod(KERNEL_SHAPE))
-    param_def = rf"^static const {c_dtype} __attribute__\(\(section\(\".rodata.tvm\"\), aligned\(16\)\)\) constant_\d+\[{np.prod(param.shape)}\] = {{$"
+    param_def = rf"^static const {c_dtype} __attribute__\(\(section\(\".rodata.tvm\"\), aligned\(16\)\)\) [a-zA-Z_0-9]*constant_\d+\[{np.prod(param.shape)}\] = {{$"
+
     for i, line in enumerate(src_lines):
         if re.match(param_def, line):
             i += 1
diff --git a/tests/python/unittest/test_micro_model_library_format.py b/tests/python/unittest/test_micro_model_library_format.py
index d707e6b4646b7..0caae1cdd9d46 100644
--- a/tests/python/unittest/test_micro_model_library_format.py
+++ b/tests/python/unittest/test_micro_model_library_format.py
@@ -15,15 +15,19 @@
 # specific language governing permissions and limitations
 # under the License.

+import pathlib
+import sys
 import datetime
 import json
 import os
 import tarfile

-import numpy
+import numpy as np
 import pytest
 import platform

+pytest.importorskip("tvm.micro")
+
 import tvm
 import tvm.relay
 from tvm.relay.backend import Executor, Runtime
@@ -31,12 +35,14 @@
 import tvm.runtime.module
 import tvm.testing
 from tvm.contrib import utils
+import tvm.micro as micro
+from tvm.micro.testing.utils import get_conv2d_relay_module
+import tvm.micro.model_library_format as model_library_format
+from tvm.micro.model_library_format import _GENERATED_VERSION


 @tvm.testing.requires_micro
 def test_export_operator_model_library_format():
-    import tvm.micro as micro
-
     target = tvm.target.target.micro("host")
     with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
         A = tvm.te.placeholder((2,), dtype="int8")
@@ -63,7 +69,7 @@ def test_export_operator_model_library_format():

     with open(os.path.join(extract_dir, "metadata.json")) as json_f:
         metadata = json.load(json_f)
-        assert metadata["version"] == 6
+        assert metadata["version"] == _GENERATED_VERSION
         assert metadata["model_name"] == "add"
         export_datetime = datetime.datetime.strptime(
             metadata["export_datetime"], "%Y-%m-%d %H:%M:%SZ"
@@ -95,8 +101,35 @@ def test_export_operator_model_library_format():
         assert tir_f.read() == str(ir_mod)


+@tvm.testing.requires_micro
+def test_export_multiple_operator_model_library_format():
+    target = tvm.target.target.micro("host")
+    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
+        A = tvm.te.placeholder((2,), dtype="int8")
+        B = tvm.te.placeholder((1,), dtype="int8")
+        C = tvm.te.compute(A.shape, lambda i: A[i] + B[0], name="C")
+        sched = tvm.te.create_schedule(C.op)
+        mod = tvm.build(
+            sched,
+            [A, B, C],
+            tvm.target.Target(target, target),
+            runtime=Runtime("crt", {"system-lib": True}),
+            name="add",
+        )
+
+    temp_dir = utils.tempdir()
+    mlf_tar_path = temp_dir.relpath("lib.tar")
+
+    with pytest.raises(RuntimeError) as exc:
+        micro.export_model_library_format([mod, mod], mlf_tar_path)
+
+    assert str(exc.value) == "Multiple operator is not supported."
+
+
 def validate_graph_json(extract_dir, factory):
-    with open(os.path.join(extract_dir, "executor-config", "graph", "graph.json")) as graph_f:
+    with open(
os.path.join(extract_dir, "executor-config", "graph", f"{factory.libmod_name}.graph") + ) as graph_f: graph_json = graph_f.read() assert graph_json == factory.graph_json @@ -141,12 +174,11 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[ executor=executor, runtime=runtime, mod_name="add", - params={"c": numpy.array([[2.0, 4.0]], dtype="float32")}, + params={"c": np.array([[2.0, 4.0]], dtype="float32")}, ) temp_dir = utils.tempdir() mlf_tar_path = temp_dir.relpath("lib.tar") - import tvm.micro as micro micro.export_model_library_format(factory, mlf_tar_path) tf = tarfile.open(mlf_tar_path) @@ -157,21 +189,22 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[ with open(os.path.join(extract_dir, "metadata.json")) as json_f: metadata = json.load(json_f) - assert metadata["version"] == 6 - assert metadata["model_name"] == "add" + module_name = factory.libmod_name + assert metadata["version"] == _GENERATED_VERSION + assert metadata["modules"][module_name]["model_name"] == "add" export_datetime = datetime.datetime.strptime( - metadata["export_datetime"], "%Y-%m-%d %H:%M:%SZ" + metadata["modules"][module_name]["export_datetime"], "%Y-%m-%d %H:%M:%SZ" ) assert (datetime.datetime.now() - export_datetime) < datetime.timedelta(seconds=60 * 5) - assert metadata["target"] == [str(target)] + assert metadata["modules"][module_name]["target"] == [str(target)] if str(executor) == "graph": - assert metadata["memory"]["sids"] == [ + assert metadata["modules"][module_name]["memory"]["sids"] == [ {"storage_id": 0, "size_bytes": 2, "input_binding": "a"}, {"storage_id": 1, "size_bytes": 8, "input_binding": "b"}, {"storage_id": 2, "size_bytes": 8, "input_binding": "p0"}, {"storage_id": 3, "size_bytes": 8}, ] - assert metadata["memory"]["functions"]["main"] == [ + assert metadata["modules"][module_name]["memory"]["functions"]["main"] == [ { "constants_size_bytes": json_constants_size_bytes, "device": 1, @@ -179,12 +212,14 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[ "workspace_size_bytes": 0, } ] - assert metadata["memory"]["functions"]["operator_functions"][0]["workspace"] == [ - {"device": 1, "workspace_size_bytes": 0} - ] + assert metadata["modules"][module_name]["memory"]["functions"]["operator_functions"][0][ + "workspace" + ] == [{"device": 1, "workspace_size_bytes": 0}] assert ( "fused_cast_multiply_add" - in metadata["memory"]["functions"]["operator_functions"][0]["function_name"] + in metadata["modules"][module_name]["memory"]["functions"]["operator_functions"][0][ + "function_name" + ] ) assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "add_lib0.c")) @@ -196,7 +231,7 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[ if str(executor) == "graph": validate_graph_json(extract_dir, factory) - with open(os.path.join(extract_dir, "src", "relay.txt")) as relay_f: + with open(os.path.join(extract_dir, "src", f"{module_name}.relay")) as relay_f: assert relay_f.read() == str(relay_mod) with open(os.path.join(extract_dir, "parameters", "add.params"), "rb") as params_f: @@ -227,12 +262,11 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[ target, runtime=Runtime("crt", {"system-lib": True}), mod_name="add", - params={"c": numpy.array([[2.0, 4.0]], dtype="float32")}, + params={"c": np.array([[2.0, 4.0]], dtype="float32")}, ) temp_dir = utils.tempdir() mlf_tar_path = temp_dir.relpath("lib.tar") - import tvm.micro as micro 
micro.export_model_library_format(factory, mlf_tar_path) tf = tarfile.open(mlf_tar_path) @@ -243,20 +277,21 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[ with open(os.path.join(extract_dir, "metadata.json")) as json_f: metadata = json.load(json_f) - assert metadata["version"] == 6 - assert metadata["model_name"] == "add" + module_name = factory.libmod_name + assert metadata["version"] == _GENERATED_VERSION + assert metadata["modules"][module_name]["model_name"] == "add" export_datetime = datetime.datetime.strptime( - metadata["export_datetime"], "%Y-%m-%d %H:%M:%SZ" + metadata["modules"][module_name]["export_datetime"], "%Y-%m-%d %H:%M:%SZ" ) assert (datetime.datetime.now() - export_datetime) < datetime.timedelta(seconds=60 * 5) - assert metadata["target"] == [str(target)] - assert metadata["memory"]["sids"] == [ + assert metadata["modules"][module_name]["target"] == [str(target)] + assert metadata["modules"][module_name]["memory"]["sids"] == [ {"storage_id": 0, "size_bytes": 2, "input_binding": "a"}, {"storage_id": 1, "size_bytes": 8, "input_binding": "b"}, {"storage_id": 2, "size_bytes": 8, "input_binding": "p0"}, {"storage_id": 3, "size_bytes": 8}, ] - assert metadata["memory"]["functions"]["main"] == [ + assert metadata["modules"][module_name]["memory"]["functions"]["main"] == [ { "constants_size_bytes": 8, "device": 1, @@ -264,19 +299,21 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[ "workspace_size_bytes": 0, } ] - assert metadata["memory"]["functions"]["operator_functions"][0]["workspace"] == [ - {"device": 1, "workspace_size_bytes": 0} - ] + assert metadata["modules"][module_name]["memory"]["functions"]["operator_functions"][0][ + "workspace" + ] == [{"device": 1, "workspace_size_bytes": 0}] assert ( "fused_cast_multiply_add" - in metadata["memory"]["functions"]["operator_functions"][0]["function_name"] + in metadata["modules"][module_name]["memory"]["functions"]["operator_functions"][0][ + "function_name" + ] ) assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "lib", "add_lib0.o")) validate_graph_json(extract_dir, factory) - with open(os.path.join(extract_dir, "src", "relay.txt")) as relay_f: + with open(os.path.join(extract_dir, "src", f"{module_name}.relay")) as relay_f: assert relay_f.read() == str(relay_mod) with open(os.path.join(extract_dir, "parameters", "add.params"), "rb") as params_f: @@ -314,7 +351,6 @@ def @main(%p0: Tensor[(1, 56, 56, 128), int16], %p1: Tensor[(3, 3, 128, 1), int1 temp_dir = utils.tempdir() mlf_tar_path = temp_dir.relpath("lib.tar") - import tvm.micro as micro micro.export_model_library_format(factory, mlf_tar_path) tf = tarfile.open(mlf_tar_path) @@ -325,14 +361,15 @@ def @main(%p0: Tensor[(1, 56, 56, 128), int16], %p1: Tensor[(3, 3, 128, 1), int1 with open(os.path.join(extract_dir, "metadata.json")) as json_f: metadata = json.load(json_f) - assert metadata["version"] == 6 - assert metadata["model_name"] == "qnn_conv2d" + module_name = factory.libmod_name + assert metadata["version"] == _GENERATED_VERSION + assert metadata["modules"][module_name]["model_name"] == "qnn_conv2d" export_datetime = datetime.datetime.strptime( - metadata["export_datetime"], "%Y-%m-%d %H:%M:%SZ" + metadata["modules"][module_name]["export_datetime"], "%Y-%m-%d %H:%M:%SZ" ) assert (datetime.datetime.now() - export_datetime) < datetime.timedelta(seconds=60 * 5) - assert metadata["target"] == [str(target)] - assert metadata["memory"]["functions"]["main"] == [ + assert 
metadata["modules"][module_name]["target"] == [str(target)] + assert metadata["modules"][module_name]["memory"]["functions"]["main"] == [ { "constants_size_bytes": 0, "device": 1, @@ -340,12 +377,14 @@ def @main(%p0: Tensor[(1, 56, 56, 128), int16], %p1: Tensor[(3, 3, 128, 1), int1 "workspace_size_bytes": 2466816, } ] - assert metadata["memory"]["functions"]["operator_functions"][0]["workspace"] == [ - {"device": 1, "workspace_size_bytes": 2466816} - ] + assert metadata["modules"][module_name]["memory"]["functions"]["operator_functions"][0][ + "workspace" + ] == [{"device": 1, "workspace_size_bytes": 2466816}] assert ( "fused_nn_conv2d_add_fixed_point_multiply_clip_cast" - in metadata["memory"]["functions"]["operator_functions"][0]["function_name"] + in metadata["modules"][module_name]["memory"]["functions"]["operator_functions"][0][ + "function_name" + ] ) @@ -354,11 +393,9 @@ def test_export_non_dso_exportable(): module = tvm.support.FrontendTestModule() temp_dir = utils.tempdir() - import tvm.micro as micro - import tvm.micro.model_library_format as model_library_format with pytest.raises(micro.UnsupportedInModelLibraryFormatError) as exc: - model_library_format._populate_codegen_dir(module, temp_dir.relpath("codegen")) + model_library_format._populate_codegen_dir([module], temp_dir.relpath("codegen")) assert str(exc.exception) == ( "Don't know how to export non-c or non-llvm modules; found: ffi_testing" @@ -408,8 +445,6 @@ def test_export_byoc_c_module(): temp_dir = utils.tempdir() mlf_tar_path = temp_dir.relpath("lib.tar") - from tvm import micro - micro.export_model_library_format(factory, mlf_tar_path) with tarfile.open(mlf_tar_path, "r:*") as tf: @@ -418,7 +453,7 @@ def test_export_byoc_c_module(): assert "./metadata.json" in tar_members with tf.extractfile("./metadata.json") as f: metadata = json.load(f) - main_md = metadata["memory"]["functions"]["main"] + main_md = metadata["modules"][factory.libmod_name]["memory"]["functions"]["main"] if platform.architecture()[0] == "64bit": assert main_md == [ { @@ -439,5 +474,140 @@ def test_export_byoc_c_module(): ] +@tvm.testing.requires_micro +def test_multiple_relay_modules_same_module_name(): + mod = get_conv2d_relay_module() + + executor = Executor("graph") + runtime = Runtime("crt") + target = tvm.target.target.micro("host") + + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + factory1 = tvm.relay.build(mod, target, runtime=runtime, executor=executor, mod_name="mod") + factory2 = tvm.relay.build(mod, target, runtime=runtime, executor=executor, mod_name="mod") + + temp_dir = utils.tempdir() + mlf_tar_path = temp_dir.relpath("lib.tar") + + with pytest.raises(AssertionError, match="Multiple modules should have unique names"): + micro.export_model_library_format([factory1, factory2], mlf_tar_path) + + +@tvm.testing.requires_micro +def test_multiple_relay_modules_graph(): + mod = get_conv2d_relay_module() + + executor = Executor("graph") + runtime = Runtime("crt") + target = tvm.target.target.micro("host") + + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + factory1 = tvm.relay.build(mod, target, runtime=runtime, executor=executor, mod_name="mod1") + factory2 = tvm.relay.build(mod, target, runtime=runtime, executor=executor, mod_name="mod2") + + temp_dir = utils.tempdir() + mlf_tar_path = temp_dir.relpath("lib.tar") + micro.export_model_library_format([factory1, factory2], mlf_tar_path) + + with tarfile.open(mlf_tar_path, "r:*") as tf: + tar_members = [ti.name for 
ti in tf.getmembers()] + print("tar members", tar_members) + assert "./metadata.json" in tar_members + assert "./codegen/host/src/mod1_lib0.c" in tar_members + assert "./codegen/host/src/mod2_lib0.c" in tar_members + + with tf.extractfile("./metadata.json") as f: + metadata = json.load(f) + mod2_main_md = metadata["modules"]["mod2"]["memory"]["functions"]["main"] + assert mod2_main_md == [ + { + "constants_size_bytes": 0, + "device": 1, + "io_size_bytes": 143960, + "workspace_size_bytes": 158088, + } + ] + assert metadata["modules"]["mod1"]["model_name"] == "mod1" + assert metadata["modules"]["mod2"]["model_name"] == "mod2" + + +@tvm.testing.requires_micro +def test_multiple_relay_modules_c(): + mod = get_conv2d_relay_module() + + executor = Executor("aot", {"unpacked-api": True, "interface-api": "c"}) + runtime = Runtime("crt") + target = tvm.target.target.micro("host") + + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + factory1 = tvm.relay.build(mod, target, runtime=runtime, executor=executor, mod_name="mod1") + factory2 = tvm.relay.build(mod, target, runtime=runtime, executor=executor, mod_name="mod2") + + temp_dir = utils.tempdir() + mlf_tar_path = temp_dir.relpath("lib.tar") + + micro.export_model_library_format([factory1, factory2], mlf_tar_path) + + tf = tarfile.open(mlf_tar_path) + + extract_dir = temp_dir.relpath("extract") + os.mkdir(extract_dir) + tf.extractall(extract_dir) + + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "mod1_lib0.c")) + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "mod1_lib1.c")) + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "mod2_lib0.c")) + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "mod2_lib1.c")) + + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "include", "tvmgen_mod1.h")) + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "include", "tvmgen_mod2.h")) + + # check CRT runtime directory + assert os.path.exists(os.path.join(extract_dir, "runtime")) + + +@tvm.testing.requires_micro +def test_multiple_relay_modules_aot_graph(): + mod = get_conv2d_relay_module() + + executor1 = Executor("graph") + executor2 = Executor("aot", {"unpacked-api": True, "interface-api": "c"}) + runtime = Runtime("crt") + target = tvm.target.target.micro("host") + + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + factory1 = tvm.relay.build( + mod, target, runtime=runtime, executor=executor1, mod_name="mod1" + ) + factory2 = tvm.relay.build( + mod, target, runtime=runtime, executor=executor2, mod_name="mod2" + ) + + temp_dir = utils.tempdir() + mlf_tar_path = temp_dir.relpath("lib.tar") + + micro.export_model_library_format([factory1, factory2], mlf_tar_path) + + tf = tarfile.open(mlf_tar_path) + extract_dir = temp_dir.relpath("extract") + os.mkdir(extract_dir) + tf.extractall(extract_dir) + + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "mod1_lib0.c")) + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "mod1_lib1.c")) + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "mod1_lib2.c")) + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "mod2_lib0.c")) + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "mod2_lib1.c")) + + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "include", "tvmgen_mod2.h")) + + with 
open(os.path.join(extract_dir, "metadata.json")) as f: + metadata = json.load(f) + + assert metadata["modules"]["mod1"]["executors"] == ["graph"] + assert metadata["modules"]["mod2"]["executors"] == ["aot"] + assert metadata["version"] == _GENERATED_VERSION + + if __name__ == "__main__": - tvm.testing.main() + sys.exit(pytest.main([__file__] + sys.argv[1:]))