diff --git a/src/relay/backend/contrib/codegen_c/codegen.cc b/src/relay/backend/contrib/codegen_c/codegen.cc index 998393d450c24..550afb3159fc0 100644 --- a/src/relay/backend/contrib/codegen_c/codegen.cc +++ b/src/relay/backend/contrib/codegen_c/codegen.cc @@ -157,8 +157,7 @@ class CodegenC : public MemoizedExprTranslator>, public Code for (size_t i = 0; i < out_shape.size(); ++i) { out_size *= out_shape[i]; } - buf_stream << dtype << "* " << out << " = (" << dtype << "*)std::malloc(4 * " << out_size - << ");"; + buf_stream << dtype << "* " << out << " = (" << dtype << "*)malloc(4 * " << out_size << ");"; buf_decl_.push_back(buf_stream.str()); decl_stream << ", " << out << ");"; @@ -229,25 +228,33 @@ class CSourceCodegen : public CSourceModuleCodegenBase { String func_name = std::get<1>(res); // Create headers - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; + code_stream_ << "#include \n"; + code_stream_ << "#include \n"; + code_stream_ << "#include \n"; code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "using namespace tvm::runtime;\n"; + code_stream_ << "#include \n"; + if (!variables.empty()) { + // This segment would be generated in C++ because of the usage + // of tvm::runtime::Array. This is not ideal, but this to demonstrate + // constant copying process used packed imports in other external + // codegen. Moreover, in uTVM we dont expect this part to be generated. + code_stream_ << "#ifdef __cplusplus\n"; + code_stream_ << "#include \n"; + code_stream_ << "#include \n"; + code_stream_ << "#endif\n"; + } // Append some common macro for operator definition. const char* operator_macro = R"op_macro( #define CSOURCE_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_, p_DTYPE) \ - extern "C" void p_ID_(p_DTYPE* a, p_DTYPE* b, p_DTYPE* out) { \ + void p_ID_(p_DTYPE* a, p_DTYPE* b, p_DTYPE* out) { \ for (int64_t i = 0; i < p_DIM1_; ++i) { \ out[i] = a[i] p_OP_ b[i]; \ } \ } #define CSOURCE_BINARY_OP_2D(p_ID_, p_OP_, p_DIM1_, p_DIM2_, p_DTYPE) \ - extern "C" void p_ID_(p_DTYPE* a, p_DTYPE* b, p_DTYPE* out) { \ + void p_ID_(p_DTYPE* a, p_DTYPE* b, p_DTYPE* out) { \ for (int64_t i = 0; i < p_DIM1_; ++i) { \ for (int64_t j = 0; j < p_DIM2_; ++j) { \ int64_t k = i * p_DIM2_ + j; \ diff --git a/src/relay/backend/contrib/codegen_c/codegen_c.h b/src/relay/backend/contrib/codegen_c/codegen_c.h index 9448b4d0738d2..af835cfca02e8 100644 --- a/src/relay/backend/contrib/codegen_c/codegen_c.h +++ b/src/relay/backend/contrib/codegen_c/codegen_c.h @@ -89,6 +89,40 @@ class CodegenCBase { indent_ -= 2; } + /*! + * \brief Creates a runtime function header + */ + void PrintRuntimeFunctionHeader(std::string func_name) { + code_stream_ << "#ifdef __cplusplus\n"; + code_stream_ << "extern \"C\" {\n"; + code_stream_ << "#endif\n"; + code_stream_ << "TVM_DLL int32_t "; + code_stream_ << func_name << "("; + code_stream_ << "TVMValue* args, "; + code_stream_ << "int* type_code, "; + code_stream_ << "int num_args, "; + code_stream_ << "TVMValue* out_value, "; + code_stream_ << "int* out_type_code) {\n"; + } + + /*! + * \brief Adds a line to convert TVMValue args to DLTensors + */ + void PrintArgToData(int idx) { + PrintIndents(); + code_stream_ << "DLTensor* arg" << idx << " = "; + code_stream_ << "(DLTensor*)(((TVMValue*)args)[" << idx << "].v_handle);\n"; + } + + /*! + * \brief Adds a line to convert TVMValue rets to DLTensors + */ + void PrintRetToData(int idx) { + PrintIndents(); + code_stream_ << "DLTensor* ret" << idx << " = "; + code_stream_ << "(DLTensor*)(((TVMValue*)args)[" << idx << "].v_handle);\n"; + } + /*! * \brief Gerenate C code for the external function. * @@ -100,12 +134,12 @@ class CodegenCBase { * Array foo_consts; * * // An example code for the generated C function. - * extern "C" int foo_wrapper_(DLTensor* arg0, + * int foo_wrapper_(DLTensor* arg0, * DLTensor* arg1, * DLTensor* out) { - * foo_(static_cast(arg0->data), - * static_cast(arg1->data), - * static_cast(out->data)); + * foo_((float*)(arg0->data), + * (float*)(arg1->data), + * (float*)(out->data)); * return 0; * } * @@ -124,7 +158,8 @@ class CodegenCBase { const std::string& const_arr_name, const std::vector& outs) { // Print signature code_stream_ << "\n"; - code_stream_ << "extern \"C\" int " << func_name << "_wrapper_("; + + code_stream_ << "int " << func_name << "_wrapper_("; for (size_t i = 0; i < args.size(); i++) { code_stream_ << "DLTensor* arg" << i << ",\n"; code_stream_ << "\t"; @@ -142,26 +177,54 @@ class CodegenCBase { code_stream_ << func_name << "_("; for (size_t i = 0; i < args.size(); i++) { const auto& dtype_str = GetDtypeString(args[i]); - code_stream_ << "static_cast<" << dtype_str << "*>(arg" << i << "->data),\n"; + code_stream_ << "(" << dtype_str << "*)(arg" << i << "->data),\n"; PrintIndents(); } for (size_t i = 0; i < outs.size() - 1; i++) { - code_stream_ << "static_cast<" << outs[i].dtype << "*>(out" << i << "->data),\n"; + code_stream_ << "(" << outs[i].dtype << "*)(out" << i << "->data),\n"; PrintIndents(); } - code_stream_ << "static_cast<" << outs.back().dtype << "*>(out" << outs.size() - 1 - << "->data));\n"; + code_stream_ << "(" << outs.back().dtype << "*)(out" << outs.size() - 1 << "->data));\n"; PrintIndents(); code_stream_ << "return 0;\n"; ExitScope(); code_stream_ << "}\n\n"; - // Generate the macro - code_stream_ << "TVM_DLL_EXPORT_TYPED_FUNC(" << func_name << ", " << func_name - << "_wrapper_);\n\n"; + // Create the external function + PrintRuntimeFunctionHeader(func_name); + EnterScope(); + for (size_t i = 0; i < args.size(); i++) { + PrintArgToData(i); + } + for (size_t i = 0; i < outs.size(); i++) { + PrintRetToData(args.size() + i); + } + PrintIndents(); + code_stream_ << func_name << "_wrapper_("; + for (size_t i = 0; i < args.size(); i++) { + code_stream_ << "arg" << i << ","; + } + for (size_t i = 0; i < outs.size() - 1; i++) { + code_stream_ << "ret" << args.size() + i << ","; + } + code_stream_ << "ret" << args.size() + outs.size() - 1 << ");\n"; + PrintIndents(); + code_stream_ << "return 0;\n"; + ExitScope(); + code_stream_ << "}\n"; + code_stream_ << "#ifdef __cplusplus\n"; + code_stream_ << "}\n"; + code_stream_ << "#endif\n"; if (!const_arr_name.empty()) { - code_stream_ << "int " << func_name << "_init_wrapper_(Array arr) {\n"; + // If there are constants, insert the __init_ and the wrapper + // This segment would be generated in C++ because of the usage + // of tvm::runtime::Array. This is not ideal, but this to demonstrate + // constant copying process used packed imports in other external + // codegen. Moreover, in uTVM we dont expect this part to be generated. + code_stream_ << "#ifdef __cplusplus\n"; + code_stream_ << "int " << func_name + << "_init_wrapper_(tvm::runtime::Array arr) {\n"; EnterScope(); PrintIndents(); code_stream_ << func_name << "_consts = arr;\n"; @@ -170,6 +233,7 @@ class CodegenCBase { code_stream_ << "}\n\n"; code_stream_ << "TVM_DLL_EXPORT_TYPED_FUNC(__init_" << func_name << ", " << func_name << "_init_wrapper_);\n\n"; + code_stream_ << "#endif\n"; } } @@ -202,11 +266,13 @@ class CodegenCBase { const std::vector& outs) { // Create a declaration for global ndarrays that contain constant data. if (!const_arr_name.empty()) { + code_stream_ << "#ifdef __cplusplus\n"; code_stream_ << const_arr_name << "\n\n"; + code_stream_ << "#endif\n"; } // Create the signature. For example, it could be: - // extern "C" void dnnl_0_(float* in0, float* in1, float* out0, float* out1) {} - code_stream_ << "extern \"C\" void " << ext_func_id << "_("; + // void dnnl_0_(float* in0, float* in1, float* out0, float* out1) {} + code_stream_ << "void " << ext_func_id << "_("; for (const auto& arg : args) { const auto& dtype_str = GetDtypeString(arg); @@ -235,14 +301,14 @@ class CodegenCBase { continue; } this->PrintIndents(); - code_stream_ << "std::memcpy(out" << i << ", " << outs[i].name << ", 4 * " << outs[i].size + code_stream_ << "memcpy(out" << i << ", " << outs[i].name << ", 4 * " << outs[i].size << ");\n"; } // Free buffers for (size_t i = 0; i < buf_decl.size(); i++) { this->PrintIndents(); - code_stream_ << "std::free(buf_" << i << ");\n"; + code_stream_ << "free(buf_" << i << ");\n"; } this->ExitScope(); @@ -310,7 +376,7 @@ class CodegenCBase { * \return The created declaration */ std::string CreateNDArrayPool(const std::string& symbol) const { - return "Array " + symbol + "_consts;"; + return "tvm::runtime::Array " + symbol + "_consts;"; } /*! @@ -322,7 +388,7 @@ class CodegenCBase { * \return The created reference */ std::string CreateDataReference(const std::string& symbol, int const_id) const { - return "static_cast(" + symbol + "_consts[" + std::to_string(const_id) + "]->data)"; + return "(float*)(" + symbol + "_consts[" + std::to_string(const_id) + "]->data)"; } /*! diff --git a/tests/micro/qemu/test_zephyr.py b/tests/micro/qemu/test_zephyr.py index 1c38c2dcd187f..ab3a25d36543b 100644 --- a/tests/micro/qemu/test_zephyr.py +++ b/tests/micro/qemu/test_zephyr.py @@ -33,6 +33,8 @@ from tvm.micro.contrib import zephyr from tvm.contrib import utils +from tvm.relay.expr_functor import ExprMutator +from tvm.relay.op.annotation import compiler_begin, compiler_end BUILD = True DEBUG = False @@ -198,5 +200,143 @@ def test_relay(platform): tvm.testing.assert_allclose(result, x_in * x_in + 1) +class CcompilerAnnotator(ExprMutator): + """ + This is used to create external functions for ccompiler. + A simple annotator that creates the following program: + | + -- begin -- + | + add + | + subtract + | + multiply + | + -- end -- + | + """ + + def __init__(self): + super(CcompilerAnnotator, self).__init__() + self.in_compiler = 0 + + def visit_call(self, call): + if call.op.name == "add": # Annotate begin at args + if self.in_compiler == 1: + lhs = compiler_begin(super().visit(call.args[0]), "ccompiler") + rhs = compiler_begin(super().visit(call.args[1]), "ccompiler") + op = relay.add(lhs, rhs) + self.in_compiler = 2 + return op + elif call.op.name == "subtract": + if self.in_compiler == 1: + lhs = super().visit(call.args[0]) + rhs = super().visit(call.args[1]) + if isinstance(lhs, relay.expr.Var): + lhs = compiler_begin(lhs, "ccompiler") + if isinstance(rhs, relay.expr.Var): + rhs = compiler_begin(rhs, "ccompiler") + return relay.subtract(lhs, rhs) + elif call.op.name == "multiply": # Annotate end at output + self.in_compiler = 1 + lhs = super().visit(call.args[0]) + rhs = super().visit(call.args[1]) + if isinstance(lhs, relay.expr.Var): + lhs = compiler_begin(lhs, "ccompiler") + if isinstance(rhs, relay.expr.Var): + rhs = compiler_begin(rhs, "ccompiler") + op = relay.multiply(lhs, rhs) + if self.in_compiler == 2: + op = compiler_end(op, "ccompiler") + self.in_compiler = 0 + return op + return super().visit_call(call) + + +def check_result(relay_mod, model, zephyr_board, map_inputs, out_shape, result): + """Helper function to verify results""" + TOL = 1e-5 + target = tvm.target.target.micro(model) + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + graph, mod, params = tvm.relay.build(relay_mod, target=target) + + with _make_session(model, target, zephyr_board, mod) as session: + rt_mod = tvm.micro.create_local_graph_runtime( + graph, session.get_system_lib(), session.context + ) + rt_mod.set_input(**params) + for name, data in map_inputs.items(): + rt_mod.set_input(name, data) + rt_mod.set_input(**params) + rt_mod.run() + + out_shapes = out_shape if isinstance(out_shape, list) else [out_shape] + results = result if isinstance(result, list) else [result] + + for idx, shape in enumerate(out_shapes): + out = tvm.nd.empty(shape, ctx=session.context) + out = rt_mod.get_output(idx, out) + tvm.testing.assert_allclose(out.asnumpy(), results[idx], rtol=TOL, atol=TOL) + + +def test_byoc_utvm(platform): + """This is a simple test case to check BYOC capabilities of uTVM""" + model, zephyr_board = PLATFORMS[platform] + x = relay.var("x", shape=(10, 10)) + w0 = relay.var("w0", shape=(10, 10)) + w1 = relay.var("w1", shape=(10, 10)) + w2 = relay.var("w2", shape=(10, 10)) + w3 = relay.var("w3", shape=(10, 10)) + w4 = relay.var("w4", shape=(10, 10)) + w5 = relay.var("w5", shape=(10, 10)) + w6 = relay.var("w6", shape=(10, 10)) + w7 = relay.var("w7", shape=(10, 10)) + + # C compiler + z0 = relay.add(x, w0) + p0 = relay.subtract(z0, w1) + q0 = relay.multiply(p0, w2) + + z1 = relay.add(x, w3) + p1 = relay.subtract(z1, w4) + q1 = relay.multiply(p1, w5) + + # Other parts on TVM + z2 = relay.add(x, w6) + q2 = relay.subtract(z2, w7) + + r = relay.concatenate((q0, q1, q2), axis=0) + f = relay.Function([x, w0, w1, w2, w3, w4, w5, w6, w7], r) + mod = tvm.IRModule() + ann = CcompilerAnnotator() + mod["main"] = ann.visit(f) + mod = tvm.relay.transform.PartitionGraph()(mod) + mod = tvm.relay.transform.InferType()(mod) + + x_data = np.random.rand(10, 10).astype("float32") + w_data = [] + for _ in range(8): + w_data.append(np.random.rand(10, 10).astype("float32")) + + map_inputs = {"w{}".format(i): w_data[i] for i in range(8)} + map_inputs["x"] = x_data + check_result( + relay_mod=mod, + map_inputs=map_inputs, + out_shape=(30, 10), + result=np.concatenate( + ( + ((x_data + w_data[0]) - w_data[1]) * w_data[2], + ((x_data + w_data[3]) - w_data[4]) * w_data[5], + x_data + w_data[6] - w_data[7], + ), + axis=0, + ), + model=model, + zephyr_board=zephyr_board, + ) + + if __name__ == "__main__": sys.exit(pytest.main([os.path.dirname(__file__)] + sys.argv[1:]))