diff --git a/python/tvm/relay/op/contrib/clml.py b/python/tvm/relay/op/contrib/clml.py
index cacd10de2865e..d253544d45d97 100644
--- a/python/tvm/relay/op/contrib/clml.py
+++ b/python/tvm/relay/op/contrib/clml.py
@@ -23,7 +23,7 @@
 from tvm.relay import transform
 from tvm.relay.build_module import bind_params_by_name
 
-from ...dataflow_pattern import wildcard, is_op, is_constant, is_tuple_get_item
+from ...dataflow_pattern import wildcard, is_op, is_constant, is_tuple_get_item, is_tuple
 from .register import register_pattern_table
 from ..strategy.generic import is_depthwise_conv2d
 
@@ -135,6 +135,7 @@ def conv_pattern():
         """Create a convolution pattern."""
         pattern = is_op("nn.conv2d")(wildcard(), is_constant())
         pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, is_constant()))
+        pattern = pattern.optional(lambda x: is_op("add")(x, is_constant()))
         pattern = pattern.optional(
             lambda x: is_op("nn.batch_norm")(
                 x, is_constant(), is_constant(), is_constant(), is_constant()
@@ -142,6 +143,7 @@ def conv_pattern():
         )
         pattern = pattern.optional(is_tuple_get_item)
         pattern = pattern.optional(is_op("nn.relu"))
+        pattern = pattern.optional(is_op("clip"))
         return pattern
 
     def batch_norm_pattern():
@@ -152,10 +154,24 @@ def batch_norm_pattern():
         pattern = is_tuple_get_item(pattern)
         return pattern
 
+    def concat_pattern():
+        """Create a concat pattern.
+
+        Returns
+        -------
+        pattern : dataflow_pattern.AltPattern
+            Denotes the concat pattern.
+        """
+        pattern = is_tuple(None)
+        pattern = is_op("concatenate")(pattern)
+
+        return pattern
+
     def dense_pattern():
         """Create a dense pattern."""
         pattern = is_op("nn.dense")(wildcard(), is_constant())
         pattern = pattern.optional(lambda x: is_op("add")(x, is_constant()))
+        pattern = pattern.optional(lambda x: is_op("nn.bias_add")(x, is_constant()))
         return pattern
 
     def pad_pattern():
@@ -172,6 +188,13 @@ def check_conv(extract):
             call = call.args[0]
             if isinstance(call, tvm.relay.expr.TupleGetItem):
                 call = call.tuple_value
+        elif call.op.name == "clip":
+            if call.attrs["a_min"] != 0.0 or call.attrs["a_max"] != 6.0:
+                return False
+            call = call.args[0]
+            if isinstance(call, tvm.relay.expr.TupleGetItem):
+                call = call.tuple_value
+
         while call.op.name != "nn.conv2d":
             call = call.args[0]
         attrs, args = call.attrs, call.args
@@ -194,6 +217,7 @@ def check_conv(extract):
         ("clml.conv2d", conv_pattern(), check_conv),
         ("clml.dense", dense_pattern()),
         ("clml.pad", pad_pattern()),
+        ("clml.concat", concat_pattern()),
         ("clml.batch_norm", batch_norm_pattern()),
     ]
 
@@ -207,11 +231,18 @@ def _func_wrapper(expr):
 
 
 _register_external_op_helper("clip")
-_register_external_op_helper("relu")
+_register_external_op_helper("nn.relu")
 _register_external_op_helper("nn.global_avg_pool2d")
 _register_external_op_helper("nn.global_max_pool2d")
+_register_external_op_helper("nn.avg_pool2d")
+_register_external_op_helper("nn.max_pool2d")
 _register_external_op_helper("nn.softmax")
 _register_external_op_helper("reshape")
+_register_external_op_helper("add")
+_register_external_op_helper("subtract")
+_register_external_op_helper("multiply")
+_register_external_op_helper("minimum")
+_register_external_op_helper("maximum")
 
 
 class OpAttrContext(object):
diff --git a/src/relay/backend/contrib/clml/codegen.cc b/src/relay/backend/contrib/clml/codegen.cc
index fa082a423d785..b89f05e178579 100644
--- a/src/relay/backend/contrib/clml/codegen.cc
+++ b/src/relay/backend/contrib/clml/codegen.cc
@@ -91,6 +91,8 @@ class CLMLJSONSerializer : public backend::contrib::JSONSerializer {
       json_node = CreateDenseJSONNode(cn);
    } else if (name == "clml.pad") {
      json_node = CreatePadJSONNode(cn);
+    } else if (name == "clml.concat") {
+      json_node = CreateConcatJSONNode(cn);
    } else {
      LOG(FATAL) << "Unrecognized CLML pattern: " << name;
    }
@@ -148,6 +150,15 @@ class CLMLJSONSerializer : public backend::contrib::JSONSerializer {
      } else {
        current_call = current_call->args[0].as<CallNode>();
      }
+    } else if (backend::IsOp(current_call, "clip")) {
+      nodes.activation = current_call;
+      nodes.act_type = "relu6";
+      if (current_call->args[0].as<TupleGetItemNode>()) {
+        auto tuple_item = current_call->args[0].as<TupleGetItemNode>();
+        current_call = tuple_item->tuple.as<CallNode>();
+      } else {
+        current_call = current_call->args[0].as<CallNode>();
+      }
    }
    if (backend::IsOp(current_call, "nn.batch_norm")) {
      nodes.bn = current_call;
@@ -279,6 +290,32 @@ class CLMLJSONSerializer : public backend::contrib::JSONSerializer {
    return json_node;
  }

+  /*!
+   * \brief Create a JSON representation of a Concat operator.
+   *
+   * \param cn The call to be represented.
+   * \return A JSON representation of a specific operator.
+   */
+  std::shared_ptr<JSONGraphNode> CreateConcatJSONNode(const CallNode* cn) {
+    const auto* fn = cn->op.as<FunctionNode>();
+    ICHECK(fn);
+    const auto* concat = fn->body.as<CallNode>();
+
+    ICHECK(backend::IsOp(concat, "concatenate"));
+    const auto* concat_op = concat->op.as<OpNode>();
+    ICHECK(concat_op);
+    const std::string name = concat_op->name;
+
+    std::vector<JSONGraphNodeEntry> inputs;
+    for (auto arg : cn->args) {
+      inputs.push_back(VisitExpr(arg)[0]);
+    }
+
+    auto json_node = std::make_shared<JSONGraphNode>(name, "kernel", inputs, 1);
+    SetCallNodeAttribute(json_node, concat);
+    return json_node;
+  }
+
  /*!
   * \brief Create a JSON representation of a Dense operator.
   *
diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc
index da41442ef91d1..cdc3b9a7b51cf 100644
--- a/src/runtime/contrib/clml/clml_runtime.cc
+++ b/src/runtime/contrib/clml/clml_runtime.cc
@@ -335,13 +335,15 @@ class CLMLRuntime : public JSONRuntimeBase {
    size_t nid;
    for (nid = 0; nid < nodes_.size(); ++nid) {
      const auto& node = nodes_[nid];
+      DLDataType tvm_dtype = node.GetOpDataType()[0];
+      cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
      if (node.GetOpType() == "input") {
-        auto clml_input = MakeCLMLTensorFromJSONNode(node);
+        auto clml_input = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
        this->layer_.storage_map.insert({nid, std::make_pair(clml_input, node)});
        this->layer_.inputs.push_back(clml_input);
        // Input copy placeholder Tensor
        this->layer_.in_placeholder.push_back(
-            MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM));
+            MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype));
      } else if (node.GetOpType() == "kernel") {
        auto op_name = node.GetOpName();
        if ("nn.conv2d" == op_name) {
@@ -364,6 +366,11 @@ class CLMLRuntime : public JSONRuntimeBase {
          auto out = CreateBatchNormLayer(&layer_, node);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
+        } else if ("nn.max_pool2d" == op_name || "nn.avg_pool2d" == op_name ||
+                   "nn.l2_pool2d" == op_name) {
+          auto out = CreatePoolingLayer(&layer_, node);
+          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
+          this->layer_.func_outs.push_back(out);
        } else if ("nn.global_max_pool2d" == op_name || "nn.global_avg_pool2d" == op_name) {
          auto out = CreateGlobalPoolingLayer(&layer_, node);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
@@ -372,6 +379,10 @@ class CLMLRuntime : public JSONRuntimeBase {
          auto out = CreateReshapeLayer(&layer_, node);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
+        } else if ("concatenate" == op_name) {
+          auto out = CreateConcatLayer(&layer_, node);
+          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
+          this->layer_.func_outs.push_back(out);
        } else if ("nn.dense" == op_name) {
          auto out = CreateDenseLayer(&layer_, node);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
@@ -388,6 +399,11 @@ class CLMLRuntime : public JSONRuntimeBase {
          auto out = CreateClipLayer(&layer_, node);
          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
          this->layer_.func_outs.push_back(out);
+        } else if ("add" == op_name || "subtract" == op_name || "multiply" == op_name ||
+                   "minimum" == op_name || "maximum" == op_name) {
+          auto out = CreateBinaryLayer(&layer_, node);
+          this->layer_.storage_map.insert({nid, std::make_pair(out, node)});
+          this->layer_.func_outs.push_back(out);
        } else {
          LOG(FATAL) << "Unsupported op: " << op_name;
        }
@@ -396,10 +412,14 @@ class CLMLRuntime : public JSONRuntimeBase {
        LOG(WARNING) << "Build Engine: Unknown Node:" << node.GetOpType();
      }
    }
-    if (nid > 0) {
-      this->layer_.outputs.push_back(this->layer_.storage_map[nid - 1].first);
+
+    for (size_t i = 0; i < outputs_.size(); ++i) {
+      nid = outputs_[i].id_;
+      DLDataType tvm_dtype = nodes_[nid].GetOpDataType()[0];
+      cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+      this->layer_.outputs.push_back(this->layer_.storage_map[nid].first);
      this->layer_.out_placeholder.push_back(
-          MakeCLMLTensorFromJSONNode(nodes_[nid - 1], CL_TENSOR_LAYOUT_NCHW_QCOM));
+          MakeCLMLTensorFromJSONNode(nodes_[nid], CL_TENSOR_LAYOUT_NCHW_QCOM, cl_dtype));
    }
    // Allocate device memories and initialize the params if any
    cl_int result = 0;
@@ -558,6 +578,20 @@ class CLMLRuntime : public JSONRuntimeBase {
    }
  }

+  cl_arithmetic_mode_qcom MakeCLArithMode(const cl_channel_type& data_type,
+                                          const cl_channel_type& acc_type = CL_FLOAT) {
+    if (data_type == CL_FLOAT && acc_type == CL_FLOAT) {
+      return CL_ARITHMETIC_MODE_FP32_QCOM;
+    } else if (data_type == CL_HALF_FLOAT && acc_type == CL_FLOAT) {
+      return CL_ARITHMETIC_MODE_FP16_ACC32_QCOM;
+    } else if (data_type == CL_HALF_FLOAT && acc_type == CL_HALF_FLOAT) {
+      return CL_ARITHMETIC_MODE_FP16_QCOM;
+    } else {
+      LOG(FATAL) << "Datatype " << data_type << " unsupported by CLML runtime";
+      return CL_ARITHMETIC_MODE_FP32_QCOM;
+    }
+  }
+
  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> MakeCLMLTensor(
      const JSONGraphNode& tensor_rep, void* data, std::vector<size_t> c_shape,
      cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_uint dtype = CL_FLOAT) {
@@ -634,6 +668,9 @@ class CLMLRuntime : public JSONRuntimeBase {
    std::vector<std::string> strides = node.GetAttr<std::vector<std::string>>("strides");
    std::vector<std::string> dilation = node.GetAttr<std::vector<std::string>>("dilation");
    std::vector<cl_uint> clml_padding = GetVectorValues(padding);
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
    if (!node.HasAttr("padding")) {
      clml_padding.resize(4);
      std::fill(clml_padding.begin(), clml_padding.end(), 0);
@@ -668,7 +705,7 @@ class CLMLRuntime : public JSONRuntimeBase {
      has_act = true;
    }
    cl_ml_op_activation_desc_qcom act_desc = {clml_act_type, CL_PROPAGATE_NAN_QCOM,
-                                              CL_ARITHMETIC_MODE_FP32_QCOM};
+                                              cl_arithmetic_mode};

    // Collect inputs and outputs, handling nn.conv2d.
    std::vector<JSONGraphNodeEntry> inputs = node.GetInputs();
@@ -680,15 +717,15 @@ class CLMLRuntime : public JSONRuntimeBase {
    has_bias = (num_inputs == 3) || (num_inputs == 7);
    has_bn = (num_inputs == 6) || (num_inputs == 7);
    // Input
-    auto input = MakeCLMLTensorFromJSONEntry(inputs[0]);
-
+    auto input =
+        MakeCLMLTensorFromJSONEntry(inputs[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    // Weight
-    auto weight = MakeCLMLTensorFromJSONEntry(inputs[1]);
-
+    auto weight =
+        MakeCLMLTensorFromJSONEntry(inputs[1], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    // Bias
    auto bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
    if (has_bias) {
-      bias = MakeCLMLTensorFromJSONEntry(inputs[2]);
+      bias = MakeCLMLTensorFromJSONEntry(inputs[2], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    } else {
      cl_ml_tensor_desc_qcom desc = {};
      desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
@@ -698,7 +735,7 @@ class CLMLRuntime : public JSONRuntimeBase {
      bias->tensor = layer_.unusedTensor;
    }
    // Output
-    auto output = MakeCLMLTensorFromJSONNode(node);
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    cl_ml_op_convolution_desc_qcom conv_desc{mode,
                                             groups,
                                             4,
                                             {clml_padding[0], clml_padding[1]},
                                             {clml_padding[2], clml_padding[3]},
@@ -707,7 +744,7 @@ class CLMLRuntime : public JSONRuntimeBase {
                                             {clml_strides[0], clml_strides[1]},
                                             {clml_dilation[0], clml_dilation[1]},
                                             0,
-                                             CL_ARITHMETIC_MODE_FP32_QCOM};
+                                             cl_arithmetic_mode};

    cl_ml_op_qcom op = NULL;
    if (!has_bn) {
@@ -734,13 +771,16 @@ class CLMLRuntime : public JSONRuntimeBase {
      auto bn_mean = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
      auto bn_var = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
      auto bn_scale = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
      auto bn_bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
-      bn_scale = MakeCLMLTensorFromJSONEntry(inputs[bn_index], bn_shape);
-      bn_bias = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 1], bn_shape);
-      bn_mean = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 2], bn_shape);
-      bn_var = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 3], bn_shape);
-
-      cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM,
-                                              CL_ARITHMETIC_MODE_FP32_QCOM};
+      bn_scale = MakeCLMLTensorFromJSONEntry(inputs[bn_index], bn_shape,
+                                             CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+      bn_bias = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 1], bn_shape,
+                                            CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+      bn_mean = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 2], bn_shape,
+                                            CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+      bn_var = MakeCLMLTensorFromJSONEntry(inputs[bn_index + 3], bn_shape,
+                                           CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+
+      cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, cl_arithmetic_mode};
      if (!has_act) {
        result = h_ClmlIntf->clCreateMLOpFusedConvolutionBatchNormForwardQCOM(
            workspace->context, 0, &conv_desc, &bn_desc, input->tensor, weight->tensor,
@@ -772,11 +812,15 @@ class CLMLRuntime : public JSONRuntimeBase {
                                cl_activation_function_qcom clml_act_type = CL_ACTIVATION_RELU) {
    cl_int result = 0;
    cl_ml_op_qcom op = NULL;
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
-    auto output = MakeCLMLTensorFromJSONNode(node);
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+                                             cl_dtype);
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);

    cl_ml_op_activation_desc_qcom act_desc = {clml_act_type, CL_PROPAGATE_NAN_QCOM,
-                                              CL_ARITHMETIC_MODE_FP32_QCOM};
+                                              cl_arithmetic_mode};

    cl_ml_tensor_desc_qcom desc = {};
    desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
@@ -805,7 +849,11 @@ class CLMLRuntime : public JSONRuntimeBase {
                                                                     const JSONGraphNode& node) {
    cl_int result = 0;
    cl_ml_op_qcom op = NULL;
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+                                             cl_dtype);
    int axis = std::stoi(node.GetAttr<std::vector<std::string>>("axis")[0]);
    auto bn_dims = get_tensor_dims(nodes_[node.GetInputs()[1].id_]);
    std::vector<size_t> bn_shape = {1, 1, 1, 1};
@@ -814,15 +862,18 @@ class CLMLRuntime : public JSONRuntimeBase {
    auto bn_var = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
    auto bn_scale = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
    auto bn_bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
-    bn_scale = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], bn_shape);
-    bn_bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], bn_shape);
-    bn_mean = MakeCLMLTensorFromJSONEntry(node.GetInputs()[3], bn_shape);
-    bn_var = MakeCLMLTensorFromJSONEntry(node.GetInputs()[4], bn_shape);
+    bn_scale = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], bn_shape,
+                                           CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    bn_bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], bn_shape,
+                                          CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    bn_mean = MakeCLMLTensorFromJSONEntry(node.GetInputs()[3], bn_shape,
+                                          CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    bn_var = MakeCLMLTensorFromJSONEntry(node.GetInputs()[4], bn_shape,
+                                         CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);

-    auto output = MakeCLMLTensorFromJSONNode(node);
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);

-    cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM,
-                                            CL_ARITHMETIC_MODE_FP32_QCOM};
+    cl_ml_op_batchnorm_desc_qcom bn_desc = {CL_BATCHNORM_MODE_SPATIAL_QCOM, cl_arithmetic_mode};

    result = h_ClmlIntf->clCreateMLOpBatchNormForwardQCOM(
        workspace->context, 0, &bn_desc, input->tensor, bn_mean->tensor, bn_var->tensor,
@@ -834,6 +885,61 @@ class CLMLRuntime : public JSONRuntimeBase {
    return output;
  }

+  /*!
+   * \brief Create a pooling layer.
+   *
+   * \note Currently nn.max_pool2d and nn.avg_pool2d are supported.
+   *
+   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
+   * \param node The JSON representation of the operator.
+   */
+  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreatePoolingLayer(CachedLayer* layer,
+                                                                    const JSONGraphNode& node) {
+    cl_int result = 0;
+    cl_ml_op_qcom op = NULL;
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+                                             cl_dtype);
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    auto in_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]);
+
+    std::vector<std::string> windows = node.GetAttr<std::vector<std::string>>("pool_size");
+    std::vector<std::string> strides = node.GetAttr<std::vector<std::string>>("strides");
+    std::vector<std::string> padding = node.GetAttr<std::vector<std::string>>("padding");
+    std::vector<cl_uint> clml_window = GetVectorValues(windows);
+    std::vector<cl_uint> clml_stride = GetVectorValues(strides);
+    std::vector<cl_uint> clml_padding = GetVectorValues(padding);
+
+    cl_ml_op_pooling_desc_qcom pool_desc = {
+        node.GetOpName() == "nn.max_pool2d" ? CL_POOLING_MODE_MAX_QCOM
+                                            : CL_POOLING_MODE_AVERAGE_EXCLUDE_PADDING_QCOM,
+        4,  // reserved
+        {clml_padding[0], clml_padding[1]},
+        {clml_padding[2], clml_padding[3]},
+        {clml_stride[0], clml_stride[1]},
+        {clml_window[0], clml_window[1]},
+        CL_PROPAGATE_NAN_QCOM,
+        cl_arithmetic_mode,
+    };
+
+    cl_ml_tensor_desc_qcom desc = {};
+    cl_ml_tensor_qcom unusedTensor = NULL;
+    desc.num_dimensions = CL_TENSOR_UNUSED_QCOM;
+    result = h_ClmlIntf->clCreateMLTensorQCOM(workspace->context, NULL, &desc, &unusedTensor);
+    ICHECK(unusedTensor && result == CL_SUCCESS) << ":" << result;
+
+    result =
+        h_ClmlIntf->clCreateMLOpPoolingForwardQCOM(workspace->context, 0, &pool_desc, input->tensor,
+                                                   unusedTensor, output->tensor, &op, tuning_cache);
+    ICHECK(op && result == CL_SUCCESS) << "Pooling Error:" << result;
+
+    layer_.func_ins.push_back(input);
+    layer->function.push_back(op);
+    return output;
+  }
+
  /*!
   * \brief Create a global pooling layer.
   *
@@ -846,8 +952,12 @@ class CLMLRuntime : public JSONRuntimeBase {
      CachedLayer* layer, const JSONGraphNode& node) {
    cl_int result = 0;
    cl_ml_op_qcom op = NULL;
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
-    auto output = MakeCLMLTensorFromJSONNode(node);
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+                                             cl_dtype);
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    auto in_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]);
    cl_ml_op_pooling_desc_qcom pool_desc = {
        node.GetOpName() == "nn.global_max_pool2d" ? CL_POOLING_MODE_MAX_QCOM
                                                   : CL_POOLING_MODE_AVERAGE_EXCLUDE_PADDING_QCOM,
@@ -858,7 +968,7 @@ class CLMLRuntime : public JSONRuntimeBase {
        {1, 1},
        {in_dims.w, in_dims.h},
        CL_PROPAGATE_NAN_QCOM,
-        CL_ARITHMETIC_MODE_FP32_QCOM,
+        cl_arithmetic_mode,
    };

    cl_ml_tensor_desc_qcom desc = {};
@@ -887,14 +997,17 @@ class CLMLRuntime : public JSONRuntimeBase {
                                                               const JSONGraphNode& node) {
    cl_int result = 0;
    cl_ml_op_qcom op = NULL;
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+                                             cl_dtype);
    auto out_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]);
-    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, CL_FLOAT, nullptr,
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype, nullptr,
                                             {out_dims.n, out_dims.c, 1, 1});

    cl_ml_op_softmax_desc_qcom softmax_desc = {CL_SOFTMAX_ALGORITHM_ACCURATE_QCOM,
-                                               CL_SOFTMAX_MODE_INSTANCE_QCOM,
-                                               CL_ARITHMETIC_MODE_FP32_QCOM};
+                                               CL_SOFTMAX_MODE_INSTANCE_QCOM, cl_arithmetic_mode};

    result = h_ClmlIntf->clCreateMLOpSoftmaxQCOM(workspace->context, 0, &softmax_desc,
                                                 input->tensor, output->tensor, &op, tuning_cache);
@@ -915,8 +1028,12 @@ class CLMLRuntime : public JSONRuntimeBase {
                                                           const JSONGraphNode& node) {
    cl_int result = 0;
    cl_ml_op_qcom op = NULL;
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
-    auto output = MakeCLMLTensorFromJSONNode(node);
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+                                             cl_dtype);
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);

    std::string pad_mode = node.GetAttr<std::vector<std::string>>("pad_mode")[0];
    std::vector<std::string> padding = node.GetAttr<std::vector<std::string>>("pad_width");
@@ -936,7 +1053,7 @@ class CLMLRuntime : public JSONRuntimeBase {
        clml_pad_mode,
        {0, 0},
        {clml_padding[0], clml_padding[1], clml_padding[2], clml_padding[3], 0, 0, 0, 0},
-        CL_ARITHMETIC_MODE_FP32_QCOM};
+        cl_arithmetic_mode};

    result = h_ClmlIntf->clCreateMLOpPadQCOM(workspace->context, 0, &pad_desc, input->tensor,
                                             output->tensor, &op, tuning_cache);
@@ -957,8 +1074,11 @@ class CLMLRuntime : public JSONRuntimeBase {
                                                            const JSONGraphNode& node) {
    cl_int result = 0;
    cl_ml_op_qcom op = NULL;
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
-    auto output = MakeCLMLTensorFromJSONNode(node);
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+                                             cl_dtype);
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);

    result = h_ClmlIntf->clCreateMLOpReshapeQCOM(workspace->context, 0, input->tensor,
                                                 output->tensor, &op, tuning_cache);
@@ -969,6 +1089,42 @@ class CLMLRuntime : public JSONRuntimeBase {
    return output;
  }

+  /*!
+   * \brief Create a concat layer.
+   *
+   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
+   * \param node The JSON representation of the operator.
+   */
+  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreateConcatLayer(CachedLayer* layer,
+                                                                   const JSONGraphNode& node) {
+    cl_int result = 0;
+    cl_ml_op_qcom op = NULL;
+    std::vector<JSONGraphNodeEntry> input_ = node.GetInputs();
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+    int inputSize = input_.size();
+    int axis = std::stoi(node.GetAttr<std::vector<std::string>>("axis")[0]);
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    cl_ml_tensor_qcom* concatInputs = new cl_ml_tensor_qcom[inputSize];
+    for (int i = 0; i < inputSize; i++) {
+      auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[i], {},
+                                               CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+      concatInputs[i] = input->tensor;
+    }
+    cl_ml_op_concat_desc_qcom concatDesc = {1, (cl_uint)inputSize, cl_arithmetic_mode};
+
+    result = h_ClmlIntf->clCreateMLOpConcatQCOM(workspace->context, 0, &concatDesc, concatInputs,
+                                                output->tensor, &op, tuning_cache);
+    ICHECK(op && result == CL_SUCCESS) << "Concat Error:" << result;
+
+    layer->function.push_back(op);
+
+    delete[] concatInputs;
+    return output;
+  }
+
  /*!
   * \brief Create a dense layer.
   *
@@ -980,21 +1136,27 @@ class CLMLRuntime : public JSONRuntimeBase {
                                                               const JSONGraphNode& node) {
    cl_int result = 0;
    cl_ml_op_qcom op = NULL;
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+    auto inp_dims = get_tensor_dims(nodes_[node.GetInputs()[0].id_]);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {1, inp_dims.c, 1, 1},
+                                             CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    auto wt_dims = get_tensor_dims(nodes_[node.GetInputs()[1].id_]);
    bool has_bias = node.GetInputs().size() == 3 ? true : false;
-
-    auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], {1, 1, wt_dims.n, wt_dims.c});
+    auto weight = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], {1, 1, wt_dims.n, wt_dims.c},
+                                              CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    auto bias = std::make_shared<cl_ml_tensor_memory_desc_qcom>();
    if (has_bias) {
      auto bias_dims = get_tensor_dims(nodes_[node.GetInputs()[2].id_]);
-      bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], {1, bias_dims.c, 1, 1});
+      bias = MakeCLMLTensorFromJSONEntry(node.GetInputs()[2], {1, bias_dims.c, 1, 1},
+                                         CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    }

    cl_ml_op_fully_connected_desc_qcom fc_desc = {1, CL_FC_WEIGHT_TRANSFORM_TRANSPOSE_QCOM,
-                                                  CL_ARITHMETIC_MODE_FP32_QCOM};
+                                                  cl_arithmetic_mode};
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);

-    auto output = MakeCLMLTensorFromJSONNode(node);
    if (has_bias) {
      result = h_ClmlIntf->clCreateMLOpFullyConnectedQCOM(
          workspace->context, 0, &fc_desc, input->tensor, weight->tensor, bias->tensor,
@@ -1021,15 +1183,17 @@ class CLMLRuntime : public JSONRuntimeBase {
                                                              const JSONGraphNode& node) {
    cl_int result = 0;
    cl_ml_op_qcom op = NULL;
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0]);
-    auto output = MakeCLMLTensorFromJSONNode(node);
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {}, CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
+                                             cl_dtype);
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
    cl_float a_max = std::stof(node.GetAttr<std::vector<std::string>>("a_max")[0]);
    cl_float a_min = std::stof(node.GetAttr<std::vector<std::string>>("a_min")[0]);

-    cl_ml_op_clip_desc_qcom clip_desc = {CL_CLIP_BY_VALUE_QCOM,
-                                         {{a_max}, CL_FLOAT},
-                                         {{a_min}, CL_FLOAT},
-                                         CL_ARITHMETIC_MODE_FP32_QCOM};
+    cl_ml_op_clip_desc_qcom clip_desc = {
+        CL_CLIP_BY_VALUE_QCOM, {{a_max}, CL_FLOAT}, {{a_min}, CL_FLOAT}, cl_arithmetic_mode};

    result = h_ClmlIntf->clCreateMLOpClipQCOM(workspace->context, 0, &clip_desc, input->tensor,
                                              output->tensor, &op, tuning_cache);
@@ -1040,6 +1204,47 @@ class CLMLRuntime : public JSONRuntimeBase {
    return output;
  }

+  /*!
+   * \brief Create a Binary layer.
+   *
+   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
+   * \param node The JSON representation of the operator.
+   */
+  std::shared_ptr<cl_ml_tensor_memory_desc_qcom> CreateBinaryLayer(CachedLayer* layer,
+                                                                   const JSONGraphNode& node) {
+    cl_int result = 0;
+    cl_ml_op_qcom op = NULL;
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype);
+    auto input_a = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0], {},
+                                               CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    auto input_b = MakeCLMLTensorFromJSONEntry(node.GetInputs()[1], {},
+                                               CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    auto output = MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
+    std::string op_name = node.GetOpName();
+    cl_binary_op_qcom binary_op = CL_TENSOR_OP_ADD_QCOM;
+    if (op_name == "subtract")
+      binary_op = CL_TENSOR_OP_SUB_QCOM;
+    else if (op_name == "multiply")
+      binary_op = CL_TENSOR_OP_MUL_QCOM;
+    else if (op_name == "minimum")
+      binary_op = CL_TENSOR_OP_MIN_QCOM;
+    else if (op_name == "maximum")
+      binary_op = CL_TENSOR_OP_MAX_QCOM;
+    cl_ml_op_binary_desc_qcom add_desc = {
+        binary_op, {{1.0}, CL_FLOAT}, {{1.0}, CL_FLOAT}, {{0.0}, CL_FLOAT}, cl_arithmetic_mode};
+
+    result = h_ClmlIntf->clCreateMLOpBinaryQCOM(workspace->context, 0, &add_desc, input_a->tensor,
+                                                input_b->tensor, output->tensor, &op, tuning_cache);
+    ICHECK(op && result == CL_SUCCESS) << op_name << " Node Error:" << result;
+
+    layer_.func_ins.push_back(input_a);
+    layer_.func_ins.push_back(input_b);
+    layer->function.push_back(op);
+    return output;
+  }
+
  /*!
   * \brief The network layers represented by acl functions.
   * \note Currently only supports a single layer.
diff --git a/tests/python/contrib/test_clml/infrastructure.py b/tests/python/contrib/test_clml/infrastructure.py
index 0cf76079e8fba..08b11525ecd2c 100644
--- a/tests/python/contrib/test_clml/infrastructure.py
+++ b/tests/python/contrib/test_clml/infrastructure.py
@@ -29,6 +29,7 @@
 from tvm.contrib import graph_executor
 from tvm.relay.op.contrib import clml
 from tvm.contrib import utils
+from tvm import autotvm
 from tvm.autotvm.measure import request_remote
 from tvm.relay.expr_functor import ExprMutator, Call
 
@@ -144,35 +145,28 @@ def skip_codegen_test():
         return True
 
 
-def build_module(mod, target, target_host, params=None, enable_clml=True):
+def build_module(mod, target, target_host, params=None, enable_clml=True, tune_log=""):
     """Build module with option to build for CLML."""
     if isinstance(mod, tvm.relay.expr.Call):
         mod = tvm.IRModule.from_expr(mod)
-    with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
-        if enable_clml:
-            mod = clml.partition_for_clml(mod, params)
-        relay.backend.te_compiler.get().clear()
-        # print("Build Mod:", mod)
-        return relay.build(mod, target=target, target_host=target_host, params=params)
+    with autotvm.apply_history_best(tune_log):
+        with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]):
+            if enable_clml:
+                mod = clml.partition_for_clml(mod, params)
+            relay.backend.te_compiler.get().clear()
+            return relay.build(mod, target=target, target_host=target_host, params=params)
 
 
 def build_and_run(
-    mod,
-    inputs,
-    outputs,
-    params,
-    device,
-    enable_clml=True,
-    no_runs=1,
-    config=None,
+    mod, inputs, outputs, params, device, enable_clml=True, no_runs=1, config=None, tune_log=""
 ):
     """Build and run the relay module."""
     if config is None:
         config = {}
 
     try:
-        libm = build_module(mod, device.target, device.target_host, params, enable_clml)
+        libm = build_module(mod, device.target, device.target_host, params, enable_clml, tune_log)
 
         clml_modules = extract_clml_modules(libm)
         for mod in clml_modules:
@@ -198,7 +192,7 @@ def build_and_run(
         for _ in range(no_runs):
             gen_module.run()
             out.append([gen_module.get_output(i) for i in range(outputs)])
-        time_f = gen_module.module.time_evaluator("run", device.device.cl(0), number=50)
+        time_f = gen_module.module.time_evaluator("run", device.device.cl(0), number=1)
         cost = time_f().mean
         print("%g secs/iteration\n" % cost)
     return out
diff --git a/tests/python/contrib/test_clml/test_network.py b/tests/python/contrib/test_clml/test_network.py
index 405f5782ff2e3..0803c787d98e7 100644
--- a/tests/python/contrib/test_clml/test_network.py
+++ b/tests/python/contrib/test_clml/test_network.py
@@ -25,20 +25,13 @@
 from test_clml.infrastructure import skip_runtime_test, build_and_run, Device
 
 
-def _build_and_run_network(mod, params, inputs, data, device, atol, rtol):
+def _build_and_run_network(mod, params, inputs, data, device, atol, rtol, tune_log=""):
     """Helper function to build and run a network."""
     outputs = []
     for clml in [True, False]:
         outputs.append(
-            build_and_run(
-                mod,
-                data,
-                1,
-                params,
-                device,
-                enable_clml=clml,
-            )[0]
+            build_and_run(mod, data, 1, params, device, enable_clml=clml, tune_log=tune_log)[0][0]
         )
 
     return outputs
@@ -55,11 +48,7 @@ def _get_keras_model(keras_model, inputs_dict, data):
     def get_bottom_top_model(model, layer_name):
         layer = model.get_layer(layer_name)
         bottom_input = model.layers[0].input
-        bottom_output = bottom_input
-        for layer in model.layers:
-            bottom_output = layer(bottom_output)
-            if layer.name == layer_name:
-                break
+        bottom_output = layer.output
         bottom_model = Model(bottom_input, bottom_output)
         return bottom_model
 
@@ -81,6 +70,8 @@ def test_mobilenet():
 
     def get_model():
         from tensorflow.keras.applications import MobileNet
+        import tensorflow as tf
+        tf.keras.backend.clear_session()
 
         mobilenet = MobileNet(
             include_top=True, weights=None, input_shape=(224, 224, 3), classes=1000
@@ -106,32 +97,111 @@ def get_model():
     )
 
     # test
-    print("OpenCL:", outputs[0][0].asnumpy().shape)
-    print("CLML:", outputs[1][0].asnumpy().shape)
+    print("CLML:", outputs[0].asnumpy().shape)
+    print("OpenCL:", outputs[1].asnumpy().shape)
 
-    opencl_sort = np.argsort(outputs[1][0].asnumpy()).flatten()
-    clml_sort = np.argsort(outputs[0][0].asnumpy()).flatten()
+    opencl_sort = np.argsort(outputs[1].asnumpy()).flatten()
+    clml_sort = np.argsort(outputs[0].asnumpy()).flatten()
 
     tvm.testing.assert_allclose(opencl_sort[:10], clml_sort[:10], rtol=1e-5, atol=1e-5)
 
-"""
-    tvm.testing.assert_allclose(
-        ref_outputs, outputs[1][0].asnumpy(), rtol=1e-5, atol=1e-5)
-    print("OpenCL to Keras looks good")
-    tvm.testing.assert_allclose(
-        outputs[0][0].asnumpy(), outputs[1][0].asnumpy(), rtol=1e-5, atol=1e-5)
-    print("OpenCL to CLML looks good")
-    exit(0)
-
-    tvm.testing.assert_allclose(
-        ref_outputs.transpose(0, 3, 1, 2), outputs[1][0].asnumpy(), rtol=1e-5, atol=1e-5)
-    print("OpenCL to Keras looks good")
-    tvm.testing.assert_allclose(
-        outputs[0][0].asnumpy(), outputs[1][0].asnumpy(), rtol=1e-5, atol=1e-5)
-    print("OpenCL to CLML looks good")
-"""
+
+def test_inception_v3():
+    Device.load("test_config.json")
+
+    if skip_runtime_test():
+        return
+
+    device = Device()
+    dtype = "float16"
+
+    def get_model():
+        from tensorflow.keras.applications import InceptionV3
+        import tensorflow as tf
+        tf.keras.backend.clear_session()
+
+        inceptionV3 = InceptionV3(
+            include_top=True, weights=None, input_shape=(299, 299, 3), classes=1000
+        )
+        inputs = {inceptionV3.input_names[0]: ((1, 3, 299, 299), "float16")}
+
+        data = {}
+        np.random.seed(0)
+        for name, (shape, dtype) in inputs.items():
+            if dtype == "uint8":
+                low, high = 0, 1
+            else:
+                low, high = -2, 1
+            data[name] = np.random.uniform(low, high, shape).astype(dtype)
+
+        mod, params, ref_outputs = _get_keras_model(inceptionV3, inputs, data)
+
+        return mod, params, inputs, data, ref_outputs
+
+    mod, params, inputs, input_data, ref_outputs = get_model()
+    outputs = _build_and_run_network(
+        mod, params, inputs, input_data, device=device, atol=1e-5, rtol=1e-5
+    )
+
+    opencl_sort = np.argsort(outputs[1].asnumpy()).flatten()
+    clml_sort = np.argsort(outputs[0].asnumpy()).flatten()
+
+    tvm.testing.assert_allclose(opencl_sort[:5], clml_sort[:5], rtol=1e-5, atol=1e-5)
+
+
+def test_resnet50v2():
+    Device.load("test_config.json")
+
+    if skip_runtime_test():
+        return
+
+    device = Device()
+    dtype = "float16"
+
+    def get_model():
+        from tensorflow.keras.applications import ResNet50V2
+        import tensorflow as tf
+        tf.keras.backend.clear_session()
+
+        model = ResNet50V2(include_top=True, weights=None, input_shape=(224, 224, 3), classes=1000)
+        inputs_dict = {model.input_names[0]: ((1, 3, 224, 224), "float32")}
+
+        data = {}
+        np.random.seed(0)
+
+        for name, (shape, dtype) in inputs_dict.items():
+            if dtype == "uint8":
+                low, high = 0, 1
+            else:
+                low, high = -1, 1
+            data[name] = np.random.uniform(low, high, shape).astype(dtype)
+
+        # Convert Keras graph to relay.
+        inputs = {}
+        for name, (shape, _) in inputs_dict.items():
+            inputs[model.input_names[0]] = shape
+
+        ref_outputs = model.predict(data["input_1"].transpose(0, 2, 3, 1))
+
+        mod, params = relay.frontend.from_keras(model, inputs, layout="NCHW")
+
+        return mod, params, inputs, data, ref_outputs
+
+    mod, params, inputs, input_data, ref_outputs = get_model()
+    outputs = _build_and_run_network(
+        mod, params, inputs, input_data, device=device, atol=1e-5, rtol=1e-5
+    )
+
+    # test
+    print("CLML:", outputs[0].asnumpy().shape)
+    print("OpenCL:", outputs[1].asnumpy().shape)
+
+    opencl_sort = np.argsort(outputs[1].asnumpy()).flatten()
+    clml_sort = np.argsort(outputs[0].asnumpy()).flatten()
+
+    tvm.testing.assert_allclose(opencl_sort[:10], clml_sort[:10], rtol=1e-5, atol=1e-5)
 
 
 if __name__ == "__main__":
     test_mobilenet()
+    test_resnet50v2()
+    test_inception_v3()
diff --git a/tests/python/contrib/test_clml/test_ops.py b/tests/python/contrib/test_clml/test_ops.py
index 13f49d1527146..b620551041273 100644
--- a/tests/python/contrib/test_clml/test_ops.py
+++ b/tests/python/contrib/test_clml/test_ops.py
@@ -211,6 +211,87 @@ def test_batchnorm():
     )
 
 
+def test_concat():
+    Device.load("test_config.json")
+
+    if skip_runtime_test():
+        return
+
+    device = Device()
+    dtype = "float16"
+    in_shape_1 = (1, 16, 16, 16)
+    in_shape_2 = (1, 16, 16, 16)
+    a = relay.var("input_1", shape=in_shape_1, dtype=dtype)
+    b = relay.var("input_2", shape=in_shape_2, dtype=dtype)
+    low, high = -1, 1
+    inputs = {
+        "input_1": tvm.nd.array(np.random.uniform(low, high, in_shape_1).astype(dtype)),
+        "input_2": tvm.nd.array(np.random.uniform(low, high, in_shape_2).astype(dtype)),
+    }
+
+    params = {}
+    func = relay.concatenate((a, b), axis=1)
+    mod = IRModule.from_expr(func)
+
+    opencl_out = build_and_run(mod, inputs, 1, params, device, enable_clml=False)[0]
+    clml_out = build_and_run(mod, inputs, 1, params, device, enable_clml=True)[0]
+
+    tvm.testing.assert_allclose(
+        clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, atol=1e-3
+    )
+
+
+def test_avgpool():
+    Device.load("test_config.json")
+
+    if skip_runtime_test():
+        return
+
+    device = Device()
+    dtype = "float16"
+    trials = [
+        # input size          pool_size  stride  padding
+        [(1, 64, 147, 147), (3, 3), (2, 2), (0, 0, 0, 0), "max"],
+        [(1, 192, 71, 71), (3, 3), (2, 2), (0, 0, 0, 0), "max"],
+        [(1, 288, 35, 35), (3, 3), (2, 2), (0, 0, 0, 0), "max"],
+        [(1, 768, 17, 17), (3, 3), (2, 2), (0, 0, 0, 0), "max"],
+        [(1, 2048, 17, 17), (3, 3), (2, 2), (0, 0, 0, 0), "max"],
+        [(1, 192, 35, 35), (3, 3), (1, 1), (0, 0, 1, 1), "avg"],
+        [(1, 256, 35, 35), (3, 3), (1, 1), (0, 0, 1, 1), "avg"],
+        [(1, 288, 35, 35), (3, 3), (1, 1), (0, 0, 1, 1), "avg"],
+        [(1, 768, 17, 17), (3, 3), (1, 1), (0, 0, 1, 1), "avg"],
+        [(1, 1280, 8, 8), (3, 3), (1, 1), (0, 0, 1, 1), "avg"],
+    ]
+    params = {}
+    for (
+        input_shape,
+        pool_size,
+        stride,
+        padding,
+        pooling_type,
+    ) in trials:
+        a = relay.var("input_1", shape=input_shape, dtype=dtype)
+        input_arr = tvm.nd.array(np.random.uniform(-1, 1, input_shape).astype(dtype))
+        inputs = {
+            "input_1": input_arr,
+        }
+
+        if pooling_type == "max":
+            func = relay.nn.max_pool2d(a, pool_size=pool_size, strides=stride, padding=padding)
+        else:
+            func = relay.nn.avg_pool2d(a, pool_size=pool_size, strides=stride, padding=padding)
+        mod = IRModule.from_expr(func)
+
+        opencl_out = build_and_run(mod, inputs, 1, params, device, enable_clml=False)[0]
+        clml_out = build_and_run(mod, inputs, 1, params, device, enable_clml=True)[0]
+
+        tvm.testing.assert_allclose(
+            clml_out[0].asnumpy(), opencl_out[0].asnumpy(), rtol=1e-3, atol=1e-3
+        )
+
+
 if __name__ == "__main__":
     test_conv2d()
-    test_batchnorm()
+    # test_batchnorm()
+    test_avgpool()
+    test_concat()
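For reviewers, a minimal usage sketch (not part of the patch) that checks the newly registered ops are claimed by the CLML partitioner. The toy concat/pooling/add graph, its shapes, and the float16 dtype are illustrative assumptions; only clml.partition_for_clml from the patched python/tvm/relay/op/contrib/clml.py is exercised, and no Adreno device is needed since partitioning is a compile-time pass.

# Sketch: confirm the new op coverage at partition time (assumed toy graph).
import numpy as np
import tvm
from tvm import relay
from tvm.relay.op.contrib import clml

dtype = "float16"
a = relay.var("a", shape=(1, 16, 16, 16), dtype=dtype)
b = relay.var("b", shape=(1, 16, 16, 16), dtype=dtype)

# concatenate, nn.max_pool2d and the elementwise binary ops are all
# registered for CLML offload by this patch.
x = relay.concatenate((a, b), axis=1)
x = relay.nn.max_pool2d(x, pool_size=(3, 3), strides=(2, 2))
x = relay.add(x, relay.const(1.0, dtype))
mod = tvm.IRModule.from_expr(x)

mod = clml.partition_for_clml(mod, params=None)
# Offloaded regions show up as functions annotated with Compiler="clml".
print(mod)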