From 04399f33cf65412d05c8eb3159004dfa833b64a8 Mon Sep 17 00:00:00 2001
From: Luke Hutton
Date: Wed, 15 Jul 2020 11:10:01 +0100
Subject: [PATCH] Address comments

* correct mistakes in tutorial
* reshuffle runtime to use fewer macro blocks
* preprocess module using "optimize" functionality
* use new module api

Change-Id: I219488e617e5767edd7489b43b8bfce876cd24b8
---
 docs/deploy/arm_compute_lib.rst                 |  19 +-
 .../tvm/relay/op/contrib/arm_compute_lib.py     |   2 +-
 .../contrib/arm_compute_lib/codegen.cc          |  24 +--
 .../contrib/arm_compute_lib/codegen_acl.h       |  15 +-
 .../contrib/arm_compute_lib/acl_allocator.cc    |   3 +-
 .../contrib/arm_compute_lib/acl_allocator.h     |   1 -
 .../contrib/arm_compute_lib/acl_runtime.cc      | 190 +++++-------------
 .../contrib/arm_compute_lib/acl_utils.h         |   2 +-
 .../test_arm_compute_lib/infrastructure.py      |   9 +-
 .../test_arm_compute_lib/test_conv2d.py         |   2 +-
 .../test_arm_compute_lib/test_network.py        |   2 +-
 .../test_arm_compute_lib/test_pooling.py        |   2 +-
 .../test_arm_compute_lib/test_reshape.py        |   2 +-
 .../test_arm_compute_lib/test_runtime.py        |   4 +-
 14 files changed, 78 insertions(+), 199 deletions(-)

diff --git a/docs/deploy/arm_compute_lib.rst b/docs/deploy/arm_compute_lib.rst
index 6902bb41c6b1..9ee9c83d7d7b 100644
--- a/docs/deploy/arm_compute_lib.rst
+++ b/docs/deploy/arm_compute_lib.rst
@@ -39,13 +39,15 @@ runtime module on an x86 machine.
 
 These flags can be used in different scenarios depending on your setup. For example, if you want
 to compile ACL on an x86 machine and then run the module on a remote Arm device via RPC, you will
-need to use USE_ACL=ON on the x86 machine and USE_GRAPH_RUNTIME_ACL=ON on the remote AArch64
-device.
+need to use USE_ARM_COMPUTE_LIB=ON on the x86 machine and USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME=ON on the remote
+AArch64 device.
 
 Usage
 -----
 
-*Note:* this section may not stay up-to-date with changes to the API.
+.. note::
+
+    This section may not stay up-to-date with changes to the API.
 
 Create a relay graph. This may be a single operator or a whole graph. The intention is that any
 relay graph can be input. The ACL integration will only pick supported operators to be offloaded
@@ -84,7 +86,7 @@ Build the Relay graph.
 
     target = "llvm -mtriple=aarch64-linux-gnu -mattr=+neon"
     with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
-        json, lib, params = relay.build(module, target=target)
+        lib = relay.build(module, target=target)
 
 Export the module.
 
@@ -96,16 +98,17 @@ Export the module.
 
     lib.export_library(lib_path, cc=cross_compile)
 
-Run Inference. This must be on an Arm device. If compiling on x86 device and running on aarch64
+Run Inference. This must be on an Arm device. If compiling on x86 device and running on aarch64,
 consider using the RPC mechanism.
 
 .. code:: python
 
-    tvm.runtime.load_module('lib_acl.so')
-    gen_module = tvm.contrib.graph_runtime.create(json, lib, ctx)
+    ctx = tvm.cpu(0)
+    loaded_lib = tvm.runtime.load_module('lib_acl.so')
+    gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib['default'](ctx))
     d_data = np.random.uniform(0, 1, data_shape).astype(data_type)
    map_inputs = {'data': d_data}
-    gen_module.map_inputs(**map_inputs)
+    gen_module.set_input(**map_inputs)
     gen_module.run()
 
diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py
index 3cb999b59a45..c13e3e8a53c1 100644
--- a/python/tvm/relay/op/contrib/arm_compute_lib.py
+++ b/python/tvm/relay/op/contrib/arm_compute_lib.py
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=invalid-name, unused-argument
-"""ACL library supported operators."""
+"""Arm Compute Library supported operators."""
 import tvm
 from tvm.relay import transform
 from tvm.relay.build_module import bind_params_by_name
diff --git a/src/relay/backend/contrib/arm_compute_lib/codegen.cc b/src/relay/backend/contrib/arm_compute_lib/codegen.cc
index a890d297245c..61f1bb4a1b13 100644
--- a/src/relay/backend/contrib/arm_compute_lib/codegen.cc
+++ b/src/relay/backend/contrib/arm_compute_lib/codegen.cc
@@ -60,11 +60,6 @@ std::vector<JSONGraphNodeEntry> ACLJSONSerializer::VisitExpr_(const CallNode* cn
   return AddNode(json_node, GetRef<Expr>(cn));
 }
 
-std::vector<JSONGraphNodeEntry> ACLJSONSerializer::VisitExpr_(const ConstantNode* cn) {
-  this->constants_.push_back(cn->data);
-  return JSONSerializer::VisitExpr_(cn);
-}
-
 std::shared_ptr<JSONGraphNode> ACLJSONSerializer::CreateOpJSONNode(const CallNode* cn) {
   const auto* op = cn->op.as<OpNode>();
   CHECK(op);
@@ -148,37 +143,28 @@ std::shared_ptr<JSONGraphNode> ACLJSONSerializer::CreateCompositeConvJSONNode(co
   return json_node;
 }
 
-Array<runtime::NDArray> ACLJSONSerializer::GetParamsData() { return constants_; }
-
 IRModule PreProcessModule(const IRModule& mod) {
   IRModule preprocessed_module;
-  tvm::Map<String, Array<String>> desired_layouts = {
-      {"nn.conv2d", {String("NHWC"), String("OHWI")}}};
+  tvm::Map<String, Array<String>> desired_layouts = {{"nn.conv2d", {"NHWC", "OHWI"}}};
   preprocessed_module = transform::ConvertLayout(desired_layouts)(mod);
   preprocessed_module = transform::FoldConstant()(preprocessed_module);
   return preprocessed_module;
 }
 
+TVM_REGISTER_GLOBAL("relay.ext.arm_compute_lib.optimize").set_body_typed(PreProcessModule);
+
 runtime::Module ACLCompiler(const ObjectRef& ref) {
   CHECK(ref->IsInstance<FunctionNode>()) << "The input ref is expected to be a Relay function.";
   Function func = Downcast<Function>(ref);
   std::string func_name = backend::GetExtSymbol(func);
 
-  IRModule mod;
-  mod->Add(GlobalVar(func_name), func);
-  mod = PreProcessModule(mod);
-
-  CHECK(mod->functions.size() == 1) << "Module should only contain single function";
-  Function processed_func = Downcast<Function>(mod->functions.begin().operator*().second);
-
-  ACLJSONSerializer serializer(func_name, processed_func);
+  ACLJSONSerializer serializer(func_name, func);
   serializer.serialize();
   std::string graph_json = serializer.GetJSON();
   auto param_names = serializer.GetParams();
-  auto param_data = serializer.GetParamsData();
   const auto* pf = runtime::Registry::Get("runtime.arm_compute_lib_runtime_create");
   CHECK(pf != nullptr) << "Cannot find JSON runtime module to create";
-  runtime::Module lib = (*pf)(func_name, graph_json, param_names, param_data);
+  runtime::Module lib = (*pf)(func_name, graph_json, param_names);
   return lib;
 }
 
diff --git a/src/relay/backend/contrib/arm_compute_lib/codegen_acl.h b/src/relay/backend/contrib/arm_compute_lib/codegen_acl.h
index 5e6e1faa5a26..a651a75e292f 100644
--- a/src/relay/backend/contrib/arm_compute_lib/codegen_acl.h
+++ b/src/relay/backend/contrib/arm_compute_lib/codegen_acl.h
@@ -55,15 +55,6 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer {
   ACLJSONSerializer(const std::string& symbol, const Expr& expr) : JSONSerializer(symbol, expr) {}
 
   std::vector<JSONGraphNodeEntry> VisitExpr_(const CallNode* cn) override;
-  std::vector<JSONGraphNodeEntry> VisitExpr_(const ConstantNode* cn) override;
-
-  /*!
-   * \brief Get the constant data transposed when pre-processing the
-   * input function.
-   *
-   * \return An array of constants
-   */
-  Array<runtime::NDArray> GetParamsData();
 
  private:
   /*!
@@ -74,10 +65,6 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer {
    */
   std::shared_ptr<JSONGraphNode> CreateOpJSONNode(const CallNode* cn);
   std::shared_ptr<JSONGraphNode> CreateCompositeConvJSONNode(const CallNode* cn);
-
-  /* \brief Transposed constant tensors to serialize. Arm Compute Library expects constant tensors
-   * in OHWI format. */
-  Array<runtime::NDArray> constants_;
 };
 
 /*!
@@ -98,7 +85,7 @@ IRModule PreProcessModule(const IRModule& mod);
  * one another. Each function consists of serialized JSON describing the sub-graph
  * and serialized constant tensors.
  *
- * \note The ACL runtime module only currently supports a single operator per
+ * \note The ACL runtime module only supports a single operator per
  * sub-graph currently.
 *
 * \param ref The ext_func Relay expression/module to be executed using extern ops.
diff --git a/src/runtime/contrib/arm_compute_lib/acl_allocator.cc b/src/runtime/contrib/arm_compute_lib/acl_allocator.cc
index 18372dcde100..2713073658ee 100644
--- a/src/runtime/contrib/arm_compute_lib/acl_allocator.cc
+++ b/src/runtime/contrib/arm_compute_lib/acl_allocator.cc
@@ -64,8 +64,7 @@ ACLMemoryRegion::~ACLMemoryRegion() {
 
 std::unique_ptr<arm_compute::IMemoryRegion> ACLMemoryRegion::extract_subregion(size_t offset,
                                                                                size_t size) {
   if (this->ptr_ != nullptr && (offset < _size) && (_size - offset >= size)) {
-    return arm_compute::support::cpp14::make_unique<ACLMemoryRegion>(
-        static_cast<uint8_t*>(this->ptr_) + offset, size);
+    return std::make_unique<ACLMemoryRegion>(static_cast<uint8_t*>(this->ptr_) + offset, size);
   } else {
     return nullptr;
   }
diff --git a/src/runtime/contrib/arm_compute_lib/acl_allocator.h b/src/runtime/contrib/arm_compute_lib/acl_allocator.h
index 407092894a44..26e413e1d08e 100644
--- a/src/runtime/contrib/arm_compute_lib/acl_allocator.h
+++ b/src/runtime/contrib/arm_compute_lib/acl_allocator.h
@@ -28,7 +28,6 @@
 #include <arm_compute/runtime/IAllocator.h>
 #include <arm_compute/runtime/IMemoryRegion.h>
 #include <arm_compute/runtime/MemoryRegion.h>
-#include <support/ToolchainSupport.h>
 #include <tvm/runtime/data_type.h>
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/registry.h>
diff --git a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc
index 6e6496f86972..951cd0be6135 100644
--- a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc
+++ b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc
@@ -47,17 +47,6 @@ using namespace tvm::runtime::json;
 
 #ifdef TVM_GRAPH_RUNTIME_ARM_COMPUTE_LIB
 using namespace arm_compute_lib;
-
-/*!
- * \brief ACL objects we cache in order to avoid needing to construct
- * a new layer each time.
- */
-struct CachedLayer {
-  std::shared_ptr<arm_compute::IFunction> function;
-  std::vector<arm_compute::Tensor> inputs;
-  std::vector<arm_compute::Tensor> const_inputs;
-  std::vector<arm_compute::Tensor> outputs;
-};
 #endif
 
 class ACLRuntime : public JSONRuntimeBase {
@@ -69,128 +58,46 @@
    * \param symbol_name The name of the function.
    * \param graph_json serialized JSON representation of a sub-graph.
    * \param const_names The names of each constant in the sub-graph.
-   * \params consts An array of constants pre-transposed to the correct layout expected by ACL.
    */
   explicit ACLRuntime(const std::string& symbol_name, const std::string& graph_json,
-                      const Array<String>& const_names, const Array<runtime::NDArray>& consts)
-      : JSONRuntimeBase(symbol_name, graph_json, const_names) {
-    this->constants_ = consts;
-  }
+                      const Array<String>& const_names)
+      : JSONRuntimeBase(symbol_name, graph_json, const_names) {}
 
   /*!
-   * \brief Get a packed function.
+   * \brief The type key of the module.
    *
-   * \param name The name/symbol of the function.
-   * \param sptr_to_self The pointer to the module node.
-   * \return The packed function.
+   * \return module type key.
    */
-  PackedFunc GetFunction(const std::string& name, const ObjectPtr<Object>& sptr_to_self) override {
-    if (name == "get_symbol") {
-      return PackedFunc(
-          [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->symbol_name_; });
-    } else if (name == "get_const_vars") {
-      return PackedFunc(
-          [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->const_names_; });
-    } else if (this->symbol_name_ == name) {
-      return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
-        CHECK(this->initialized_) << "The module has not been initialized";
-
-        // Bind argument tensors to data entries.
-        this->SetInputOutputBuffers(args);
-        // Execute the subgraph.
-        this->Run();
-      });
-    } else if ("__init_" + this->symbol_name_ == name) {
-      // The function to initialize constant tensors.
-      return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
-        this->Init();
-        this->initialized_ = true;
-        *rv = 0;
-      });
-    } else {
-      return PackedFunc(nullptr);
-    }
-  }
+  const char* type_key() const override { return "arm_compute_lib"; }
 
   /*!
-   * \brief Save a compiled network to a binary stream, which can then be
-   * serialized to disk.
+   * \brief Initialize runtime. Create ACL layer from JSON
+   * representation.
    *
-   * \param stream The stream to save the binary.
+   * \param consts The constant params from compiled model.
    */
-  void SaveToBinary(dmlc::Stream* stream) override {
-    // Save the symbol
-    stream->Write(symbol_name_);
-    // Save the graph
-    stream->Write(graph_json_);
-    // Save the required const names
-    std::vector<std::string> const_names;
-    for (const auto& it : const_names_) {
-      const_names.push_back(it);
-    }
-    stream->Write(const_names);
-    // Save the required constant data
-    stream->Write(constants_.size());
-    for (const auto& it : constants_) {
-      it.Save(stream);
-    }
+  void Init(const Array<NDArray>& consts) override {
+    CHECK_EQ(consts.size(), const_idx_.size())
+        << "The number of input constants must match the number of required.";
+    SetupConstants(consts);
+    BuildEngine();
   }
 
   /*!
-   * \brief Load a compiled network from stream.
+   * \brief Get the JSON generated by codegen.
    *
-   * \param strm The binary stream to load.
-   * \return The created ACL module.
+   * \param format the format to return (only JSON for the time being)
+   * \return A string of JSON.
    */
-  static Module LoadFromBinary(void* strm) {
-    dmlc::Stream* stream = static_cast<dmlc::Stream*>(strm);
-    std::string symbol;
-    std::string graph_json;
-    std::vector<std::string> consts;
-    // Load the symbol
-    CHECK(stream->Read(&symbol)) << "Loading symbol name failed";
-    CHECK(stream->Read(&graph_json)) << "Loading graph json failed";
-    CHECK(stream->Read(&consts)) << "Loading the const name list failed";
-    Array<String> const_names;
-    for (const auto& it : consts) {
-      const_names.push_back(it);
-    }
-    size_t const_data_count;
-    CHECK(stream->Read(&const_data_count));
-    Array<runtime::NDArray> const_data;
-    for (size_t i = 0; i < const_data_count; ++i) {
-      runtime::NDArray temp;
-      CHECK(temp.Load(stream)) << "Failed to load constant";
-      const_data.push_back(temp);
+  std::string GetSource(const std::string& format) override {
+    if (format == "json") {
+      return graph_json_;
     }
-    auto n = make_object<ACLRuntime>(symbol, graph_json, const_names, const_data);
-    return Module(n);
+    LOG(FATAL) << "Format not supported by Arm Compute Library runtime.";
+    return "";
   }
 
-  /*!
-   * \brief The type key of the module.
-   *
-   * \return module type key.
-   */
-  const char* type_key() const override { return "arm_compute_lib"; }
-
-  /*!
-   * \brief Initialize runtime. Create ACL layer from JSON
-   * representation.
-   */
-  void Init() {
-    CHECK_EQ(this->constants_.size(), const_idx_.size())
-        << "The number of input constants must match the number expected.";
-    this->SetupConstants(this->constants_);
 #ifdef TVM_GRAPH_RUNTIME_ARM_COMPUTE_LIB
-    BuildEngine();
-#endif
-  }
-
-  // Do not accept constants from MetadataModule as they should be transposed
-  // by the ACL codegen so they have the correct expected layout.
-  void Init(const Array<NDArray>& constants) override { LOG(FATAL) << "Not implemented."; }
-
   /*!
    * \brief Unpack inputs and outputs and run inference on a given layer.
    *
@@ -199,7 +106,6 @@ class ACLRuntime : public JSONRuntimeBase {
    * \return Status of inference.
    */
   void Run() override {
-#ifdef TVM_GRAPH_RUNTIME_ARM_COMPUTE_LIB
     for (size_t i = 0; i < input_nodes_.size(); ++i) {
       auto nid = input_nodes_[i];
       uint32_t eid = EntryID(nid, 0);
@@ -216,28 +122,9 @@
     }
 
     this->layer_.function->run();
-#else
-    LOG(FATAL) << "Cannot call run on Arm Compute Library module without runtime enabled. "
-               << "Please build with USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME.";
-#endif
-  }
-
-  /*!
-   * \brief Get the JSON generated by codegen.
-   *
-   * \param format the format to return (only JSON for the time being)
-   * \return A string of JSON.
-   */
-  std::string GetSource(const std::string& format) override {
-    if (format == "json") {
-      return graph_json_;
-    }
-    LOG(FATAL) << "Format not supported by Arm Compute Library runtime.";
-    return "";
   }
 
  private:
-#ifdef TVM_GRAPH_RUNTIME_ARM_COMPUTE_LIB
   /*!
   * \brief Build ACL layer from JSON representation and cache.
   *
@@ -287,6 +174,17 @@
     if (num_pools > 0) mm->populate(this->allocator_, num_pools);
   }
 
+  /*!
+   * \brief ACL objects we cache in order to avoid needing to construct
+   * a new layer each time.
+   */
+  struct CachedLayer {
+    std::shared_ptr<arm_compute::IFunction> function;
+    std::vector<arm_compute::Tensor> inputs;
+    std::vector<arm_compute::Tensor> const_inputs;
+    std::vector<arm_compute::Tensor> outputs;
+  };
+
   /*!
    * \brief Create a 2D convolution layer.
    *
@@ -378,25 +276,33 @@
   /*! \brief Allow ACL functions to request auxiliary memory from TVM. */
   arm_compute_lib::ACLAllocator allocator_;
 
-  /*! \brief The network layers represented by acl functions. Note: currently only supports a single
-   * layer.
-   */
+  /*!
+   * \brief The network layers represented by acl functions.
+   * \note Currently only supports a single layer.
+   */
   CachedLayer layer_;
-#endif
+#else
+  void Run() override {
+    LOG(FATAL) << "Cannot call run on Arm Compute Library module without runtime enabled. "
+               << "Please build with USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME.";
+  }
 
-  /*! \brief Array of pre-transposed constants from ACL codegen. */
-  Array<runtime::NDArray> constants_;
+  void BuildEngine() {
+    // Do nothing.
+  }
+#endif
 };
 
 runtime::Module ACLRuntimeCreate(const String& symbol_name, const String& graph_json,
-                                 const Array<String>& const_names, const Array<NDArray>& consts) {
-  auto n = make_object<ACLRuntime>(symbol_name, graph_json, const_names, consts);
+                                 const Array<String>& const_names) {
+  auto n = make_object<ACLRuntime>(symbol_name, graph_json, const_names);
   return runtime::Module(n);
 }
 
 TVM_REGISTER_GLOBAL("runtime.arm_compute_lib_runtime_create").set_body_typed(ACLRuntimeCreate);
 
 TVM_REGISTER_GLOBAL("runtime.module.loadbinary_arm_compute_lib")
-    .set_body_typed(ACLRuntime::LoadFromBinary);
+    .set_body_typed(JSONRuntimeBase::LoadFromBinary<ACLRuntime>);
 
 }  // namespace contrib
 }  // namespace runtime
diff --git a/src/runtime/contrib/arm_compute_lib/acl_utils.h b/src/runtime/contrib/arm_compute_lib/acl_utils.h
index 41de7b58302f..de35c307caa0 100644
--- a/src/runtime/contrib/arm_compute_lib/acl_utils.h
+++ b/src/runtime/contrib/arm_compute_lib/acl_utils.h
@@ -97,7 +97,7 @@ arm_compute::TensorShape MakeTensorShape(const std::vector<int64_t>& shape);
  */
 std::shared_ptr<arm_compute::MemoryManagerOnDemand> MakeMemoryManager();
 
-/*
+/*!
 * \brief Convert TVM padding and stride format to acl PadStrideInfo.
 *
 * \param pad The pad vector.
diff --git a/tests/python/contrib/test_arm_compute_lib/infrastructure.py b/tests/python/contrib/test_arm_compute_lib/infrastructure.py
index a8974783f20c..2f2cf010c7bc 100644
--- a/tests/python/contrib/test_arm_compute_lib/infrastructure.py
+++ b/tests/python/contrib/test_arm_compute_lib/infrastructure.py
@@ -99,11 +99,10 @@ def build_module(mod, target, params=None, enable_acl=True):
 
 def build_and_run(mod, inputs, outputs, params, device, enable_acl=True, no_runs=1):
     """Build and run the relay module."""
-    graph, lib, params = build_module(mod, device.target, params, enable_acl)
+    lib = build_module(mod, device.target, params, enable_acl)
     lib = update_lib(lib, device.device, device.cross_compile)
-    gen_module = graph_runtime.create(graph, lib, ctx=device.device.cpu(0))
+    gen_module = graph_runtime.GraphModule(lib['default'](device.device.cpu(0)))
     gen_module.set_input(**inputs)
-    gen_module.set_input(**params)
     for _ in range(no_runs):
         gen_module.run()
     out = [gen_module.get_output(i) for i in range(outputs)]
@@ -138,13 +137,13 @@ def verify(answers, atol, rtol):
 def extract_acl_modules(module):
     """Get the ACL module(s) from llvm module."""
     return list(filter(lambda mod: mod.type_key == "arm_compute_lib",
-                       module.imported_modules))
+                       module.lib.imported_modules))
 
 
 def verify_codegen(module, known_good_codegen, num_acl_modules,
                    target="llvm -mtriple=aarch64-linux-gnu -mattr=+neon"):
     """Check acl codegen against a known good output."""
-    _, module, _ = build_module(module, target)
+    module = build_module(module, target)
     acl_modules = extract_acl_modules(module)
 
     assert len(acl_modules) == num_acl_modules, \
diff --git a/tests/python/contrib/test_arm_compute_lib/test_conv2d.py b/tests/python/contrib/test_arm_compute_lib/test_conv2d.py
index f09589e54f3b..fffb6b09f455 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_conv2d.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_conv2d.py
@@ -14,7 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""ACL Integration conv2d tests."""
+"""Arm Compute Library integration conv2d tests."""
 
 import numpy as np
 
diff --git a/tests/python/contrib/test_arm_compute_lib/test_network.py b/tests/python/contrib/test_arm_compute_lib/test_network.py
index 9b8ff0088b5a..4a5206808126 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_network.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_network.py
@@ -14,7 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""ACL network tests."""
+"""Arm Compute Library network tests."""
 
 import numpy as np
 
diff --git a/tests/python/contrib/test_arm_compute_lib/test_pooling.py b/tests/python/contrib/test_arm_compute_lib/test_pooling.py
index bb3758ab91c3..e3ba80efede7 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_pooling.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_pooling.py
@@ -14,7 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""ACL Integration pooling tests."""
+"""Arm Compute Library integration pooling tests."""
 
 import numpy as np
 
diff --git a/tests/python/contrib/test_arm_compute_lib/test_reshape.py b/tests/python/contrib/test_arm_compute_lib/test_reshape.py
index 0d0656eaa760..9ba13c98ffee 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_reshape.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_reshape.py
@@ -14,7 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""ACL Integration reshape tests."""
+"""Arm Compute Library integration reshape tests."""
 
 import numpy as np
 
diff --git a/tests/python/contrib/test_arm_compute_lib/test_runtime.py b/tests/python/contrib/test_arm_compute_lib/test_runtime.py
index 7e4714bafbcb..5ab8699c4a93 100644
--- a/tests/python/contrib/test_arm_compute_lib/test_runtime.py
+++ b/tests/python/contrib/test_arm_compute_lib/test_runtime.py
@@ -14,7 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-"""ACL runtime tests."""
+"""Arm Compute Library runtime tests."""
 
 import numpy as np
 
@@ -27,7 +27,7 @@ def test_multiple_ops():
     """
-    Test multiple operators destined for acl.
+    Test multiple operators destined for ACL.
 
     ACL will expect these ops as in 2 separate functions.
     """
     if skip_runtime_test():
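
---

A note on the new module API used throughout this patch: relay.build() now returns a single
factory module rather than the old (json, lib, params) triple, and the graph runtime is created
from that module's 'default' function. The following is a minimal end-to-end sketch of the
updated flow, assuming the partition_for_arm_compute_lib helper from
python/tvm/relay/op/contrib/arm_compute_lib.py (touched above); the max_pool2d graph, shapes,
and file names are illustrative only:

    import numpy as np

    import tvm
    from tvm import relay
    from tvm.contrib import graph_runtime
    from tvm.relay.op.contrib.arm_compute_lib import partition_for_arm_compute_lib

    # On the x86 host: build an NHWC graph and offload supported operators to ACL.
    data_shape = (1, 14, 14, 512)
    data_type = "float32"
    data = relay.var("data", shape=data_shape, dtype=data_type)
    out = relay.nn.max_pool2d(data, pool_size=(2, 2), strides=(2, 2), layout="NHWC")
    module = tvm.IRModule.from_expr(out)
    module = partition_for_arm_compute_lib(module)

    target = "llvm -mtriple=aarch64-linux-gnu -mattr=+neon"
    with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
        lib = relay.build(module, target=target)  # single runtime module

    # Cross-compile the shared library for the AArch64 target.
    lib.export_library("lib_acl.so", cc="aarch64-linux-gnu-g++")

    # On the AArch64 device: load the library and create the graph runtime
    # from the factory's 'default' function.
    ctx = tvm.cpu(0)
    loaded_lib = tvm.runtime.load_module("lib_acl.so")
    gen_module = graph_runtime.GraphModule(loaded_lib["default"](ctx))
    d_data = np.random.uniform(0, 1, data_shape).astype(data_type)
    gen_module.set_input(data=d_data)
    gen_module.run()
    output = gen_module.get_output(0)

Because constants are now bound through the standard Init(consts) path of JSONRuntimeBase, the
custom SaveToBinary/LoadFromBinary and GetFunction plumbing removed above is no longer needed.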