From 1296cc13aff0a4541376d7f9c05848c73e8f3dac Mon Sep 17 00:00:00 2001 From: weishengying <1343838695@qq.com> Date: Mon, 7 Mar 2022 09:05:23 +0000 Subject: [PATCH] [BugFix]fix bug of opt tool --- .../optimizer/mir/static_kernel_pick_pass.cc | 21 ++++-- .../optimizer/mir/static_kernel_pick_pass.h | 67 ++++++++++++++--- lite/core/program.cc | 2 + lite/core/type_system.h | 58 ++++----------- lite/kernels/arm/slice_compute.cc | 72 +++++++++++++++++++ 5 files changed, 159 insertions(+), 61 deletions(-) diff --git a/lite/core/optimizer/mir/static_kernel_pick_pass.cc b/lite/core/optimizer/mir/static_kernel_pick_pass.cc index 66f38b812c9..92695aa9ed7 100644 --- a/lite/core/optimizer/mir/static_kernel_pick_pass.cc +++ b/lite/core/optimizer/mir/static_kernel_pick_pass.cc @@ -41,10 +41,14 @@ void StaticKernelPickPass::Apply(const std::unique_ptr& graph) { CHECK(graph) << "graph not valid"; // sort kernels by the factors. - VLOG(4) << "graph->mutable_nodes().size():" << graph->mutable_nodes().size(); + VLOG(2) << "graph block_idx: " << graph->blockIdx(); + VLOG(2) << "graph->mutable_nodes().size(): " << graph->mutable_nodes().size(); + size_t idx = 0; for (auto& node : graph->mutable_nodes()) { if (!node.IsStmt()) continue; auto& instruct = node.AsStmt(); + VLOG(2) << "pick kernel for op : " << instruct.op_type() << ", in block " + << graph->blockIdx() << ", idx : " << idx++; std::map in_types; std::map out_types; @@ -66,17 +70,19 @@ void StaticKernelPickPass::Apply(const std::unique_ptr& graph) { std::vector>> scored; CHECK(!instruct.kernels().empty()) << "No kernels found for " << instruct.op_type(); - VLOG(4) << "instruct.kernels().size():" << instruct.kernels().size(); + + VLOG(2) << "candidate kernels size:" << instruct.kernels().size(); for (auto&& kernel : instruct.kernels()) { - float score = KernelGrade(instruct, + VLOG(2) << "current candidate kernel is: " << kernel->summary(); + VLOG(2) << "valid_places size is: " << graph->valid_places().size(); + float score = 
KernelGrade(&node, *kernel, graph->valid_places(), in_types, out_types, instruct.op_info()->input_names(), instruct.op_info()->output_names()); - VLOG(4) << "kernel->summary():" << kernel->summary() - << " score:" << score; + scored.emplace_back(score, std::move(kernel)); } std::stable_sort(scored.begin(), scored.end(), KernelScoreCmp); @@ -87,7 +93,8 @@ void StaticKernelPickPass::Apply(const std::unique_ptr& graph) { // Just keep a single best kernel. // TODO(Superjomn) reconsider this. instruct.kernels().emplace_back(std::move(scored.front().second)); - VLOG(2) << "pick " << instruct.kernels().front()->summary() << "\n\n"; + VLOG(2) << "the final pick kernel is " + << instruct.kernels().front()->summary() << "\n\n"; } else { bool out_type_int8 = true; @@ -137,7 +144,7 @@ void StaticKernelPickPass::Apply(const std::unique_ptr& graph) { instruct.ResetOp(update_desc, graph->valid_places()); scored.clear(); for (auto&& kernel : instruct.kernels()) { - float score = KernelGrade(instruct, + float score = KernelGrade(&node, *kernel, graph->valid_places(), in_types, diff --git a/lite/core/optimizer/mir/static_kernel_pick_pass.h b/lite/core/optimizer/mir/static_kernel_pick_pass.h index ab9ed33e685..6238d17a88f 100644 --- a/lite/core/optimizer/mir/static_kernel_pick_pass.h +++ b/lite/core/optimizer/mir/static_kernel_pick_pass.h @@ -50,13 +50,14 @@ class StaticKernelPickPass : public mir::StmtPass { private: // Score the kernel. 
- size_t KernelGrade(const lite::mir::Node::Stmt& instruct, + size_t KernelGrade(lite::mir::Node* node, const lite::KernelBase& kernel, const std::vector& places, const std::map& in_types, const std::map& out_types, const std::vector& in_names, const std::vector& out_names) { + const auto& instruct = node->AsStmt(); CHECK_GT(places.size(), static_cast(0)) << "valid_places is empty."; float final_score{-1.}; Place winner_place{places[0]}; @@ -76,14 +77,19 @@ class StaticKernelPickPass : public mir::StmtPass { for (size_t i = 0; i < place_size; ++i) { const auto& place = places[i]; float weight = static_cast(place_size - i) / place_size; + VLOG(4) << "current place is " << place.DebugString() << ", idx : " << i + << ", weight : " << weight; size_t score{}; // The more important factor comes first if (kernel_pick_factors_.IsTargetConsidered() && (place.target == kernel.target() || kernel.target() == TARGET(kAny) || place.target == TARGET(kAny))) { - score += kMax / - static_cast(core::KernelPickFactor::Factor::TargetFirst); + size_t target_score = + kMax / + static_cast(core::KernelPickFactor::Factor::TargetFirst); + score += target_score; + VLOG(4) << "[TargetConsidered score]:" << target_score; } VLOG(4) << "[score s1]:" << score; if (kernel_pick_factors_.IsPrecisionConsidered() && @@ -93,8 +99,11 @@ class StaticKernelPickPass : public mir::StmtPass { // score skipped, if kernel is int8, but op is not int8 if (!(kernel.precision() == PRECISION(kInt8) && !instruct.op_info()->HasAttr("enable_int8"))) { - score += kMax / static_cast( - core::KernelPickFactor::Factor::PrecisionFirst); + size_t precision_score = + kMax / + static_cast(core::KernelPickFactor::Factor::PrecisionFirst); + score += precision_score; + VLOG(4) << "[PrecisionConsidered score]:" << precision_score; } } VLOG(4) << "[score s2]:" << score; @@ -102,8 +111,11 @@ class StaticKernelPickPass : public mir::StmtPass { (place.layout == kernel.layout() || kernel.layout() == DATALAYOUT(kAny) || place.layout 
== DATALAYOUT(kAny))) { - score += kMax / static_cast( - core::KernelPickFactor::Factor::DataLayoutFirst); + size_t datalayout_score = + kMax / + static_cast(core::KernelPickFactor::Factor::DataLayoutFirst); + score += datalayout_score; + VLOG(4) << "[DataLayoutConsidered score]:" << datalayout_score; } VLOG(4) << "[score s3]:" << score; @@ -138,10 +150,44 @@ class StaticKernelPickPass : public mir::StmtPass { } if (type_match) { score *= 2; + VLOG(4) << "[Input precision compatible]: *2"; } VLOG(4) << "[score s4]:" << score; } + // add new rules for datatype: When the input types are consistent with + // kernel's input types, select the kernel of the datatype. + if (instruct.op_info()->Type() != "conditional_block" && + instruct.op_info()->Type() != "while" && + instruct.op_info()->Type() != "subgraph") { + bool datatype_match = true; + for (auto* in : node->inlinks) { + if (!in->IsArg()) continue; + if (in->AsArg().name == "feed" || in->AsArg().is_persist) continue; + std::string argname; + instruct.op_info()->GetInputArgname(in->AsArg().name, &argname); + VLOG(5) << "input var name : " << in->AsArg().name; + // only when datatype is LOD_TENSOR, LOD_TENSOR_ARRAY, STEP_SCOPES, + // the type pointer is not null; + if (in->AsArg().type) { + VLOG(5) << "input datatype : " + << static_cast(in->AsArg().type->id()); + VLOG(5) << "kernel bind datatype : " + << static_cast(kernel.GetInputDeclType(argname)->id()); + if (static_cast(in->AsArg().type->id()) != + static_cast(kernel.GetInputDeclType(argname)->id())) + datatype_match = false; + } else { + datatype_match = false; + } + } + if (datatype_match) { + score *= 2; + VLOG(4) << "[Input datatype compatible]: *2"; + } + VLOG(4) << "[score s5]:" << score; + } + if (weight * score > final_score) { final_score = weight * score; winner_place = place; @@ -191,9 +237,8 @@ class StaticKernelPickPass : public mir::StmtPass { } } - VLOG(4) << "[score(final)]:" << final_score; - VLOG(2) << "-------- pick summary for " << 
instruct.op_type() - << " --------"; + VLOG(2) << "-------- score summary for candidate kernel : " + << kernel.summary() << " --------"; VLOG(2) << " ===> winner_place():" << PrecisionToStr(winner_place.precision) << " " << DataLayoutToStr(winner_place.layout) << " " << TargetToStr(winner_place.target); @@ -203,8 +248,8 @@ class StaticKernelPickPass : public mir::StmtPass { << TargetToStr(kernel.place().target); VLOG(4) << "kernel.op_type():" << kernel.op_type(); VLOG(4) << "kernel picker factors:" << kernel_pick_factors_; - VLOG(4) << "kernel place:" << kernel.place().DebugString(); VLOG(4) << "winner_picker place:" << winner_place.DebugString(); + VLOG(4) << "[score(final)]:" << final_score; VLOG(4) << "------------------------------"; // The data layout is not considered, for the input and output arguments diff --git a/lite/core/program.cc b/lite/core/program.cc index 88a58c0569d..25cc36b3c15 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -606,6 +606,8 @@ void Program::PrepareWorkspace( } else if (var_type == lite::VarDescAPI::Type::LOD_TENSOR_ARRAY) { var_type_map_[var_name] = LiteType::GetTensorListTy( TARGET(kUnk), PRECISION(kUnk), DATALAYOUT(kUnk)); + auto* tensor_array = var->GetMutable>(); + tensor_array->resize(0); } else if (var_type == lite::VarDescAPI::Type::STEP_SCOPES) { var->GetMutable>(); } diff --git a/lite/core/type_system.h b/lite/core/type_system.h index d51ee833114..2a7dc924e68 100644 --- a/lite/core/type_system.h +++ b/lite/core/type_system.h @@ -60,15 +60,11 @@ namespace lite { // We use Types to declare the definition of a kernel, each inputs' and outputs' // arguments have a specific Types. 
// -// REGISTER_LITE_KERNEL(mul, kHost, kFloat, -// paddle::lite::kernels::host::MulCompute, def) -// .BindInput("X", {paddle::lite::Type::Get( -// TARGET(kHost))}) -// .BindInput("Y", {paddle::lite::Type::Get( -// TARGET(kHost))}) -// .BindOutput("Out", -// {paddle::lite::Type::Get(TARGET(kHost))}) -// .Finalize(); +// REGISTER_LITE_KERNEL(mul, kARM, kInt8, kNCHW, Mul_int8_f32, def) +// .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) +// .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) +// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) +// .Finalize(); // // The above definition will be used in MIR by Type inference and uncompatible // types check. @@ -116,13 +112,13 @@ class DataType { }; /* - * Datatype with device info considered. - * NOTE A Type with different device is treated as different DeviceDataType. + * Datatype with Place info considered. + * NOTE A Type with different Place info is treated as different Type. */ class Type : public DataType { public: // Can cast to another type. This is heavily used in MIR, by determine whether - // is is possible to add a statement to transform a type to another. + // is possible to add a statement to transform a type to another. virtual bool TypeCastable(const Type& type) const { return id_ == type.id(); } /// Get a Tensor type. @@ -258,30 +254,6 @@ struct ParamType { std::string DebugString() const { return type->name(); } }; -/* - * The data types of kernel parameters. It is used to track the type of kernel's - * inputs and outputs. 
- */ -struct ParamTypeRecorder { - std::map inputs; - std::map outputs; - - void RegisterInputType(const std::string& arg_name, const ParamType& type) { - Register(&inputs, arg_name, type); - } - - void RegisterOutputType(const std::string& arg_name, const ParamType& type) { - Register(&outputs, arg_name, type); - } - - private: - void Register(std::map* ts, - const std::string& arg_name, - ParamType type) { - (*ts)[arg_name] = type; - } -}; - /* * The ParamTypeRegistry help register the input and output data types for all * the kernels. It is made singleton so that all the objects of the same kernel @@ -296,19 +268,19 @@ struct ParamTypeRecorder { class ParamTypeRegistry { public: enum class IO : int { kInvalid = 0, kInput, kOutput }; - - template /* * Helper class for registering a ParamType for a Kernel. * Usage: * * NewInstance("fc") - * .BindInput(0, {typeid(Tensor).hash_code(), {TARGET(kHost)}) - * .BindInput(1, {typeid(Tensor).hash_code(), {TARGET(kHost), - * PRECISION(kFloat)}); + * .BindInput("Input_0", {Type::GetTensorTy(TARGET(kHost), + * PRECISION(kInt64))}) + * .BindInput("Input_1", {Type::GetTensorTy(TARGET(kHost), + * PRECISION(kInt64))}); */ + template struct NewInstance { explicit NewInstance(const std::string& kernel_type) : kernel_type_(kernel_type) {} diff --git a/lite/kernels/arm/slice_compute.cc b/lite/kernels/arm/slice_compute.cc index a25e2ebd0ab..507a5c087ad 100644 --- a/lite/kernels/arm/slice_compute.cc +++ b/lite/kernels/arm/slice_compute.cc @@ -262,6 +262,20 @@ REGISTER_LITE_KERNEL(slice, kARM, kFloat, kNCHW, slice_float, def) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .Finalize(); +REGISTER_LITE_KERNEL(slice, kARM, kFloat, kNCHW, slice_float, array_def) + .BindInput("Input", + {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kFloat))}) + .BindInput("StartsTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("EndsTensor", + {LiteType::GetTensorTy(TARGET(kARM), 
PRECISION(kInt32))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) + .Finalize(); + REGISTER_LITE_KERNEL( slice, kARM, kFloat, kNCHW, slice_float, float_i64_starts_ends) .BindInput("Input", @@ -277,6 +291,21 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .Finalize(); +REGISTER_LITE_KERNEL( + slice, kARM, kFloat, kNCHW, slice_float, array_float_i64_starts_ends) + .BindInput("Input", + {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kFloat))}) + .BindInput("StartsTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("EndsTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) + .Finalize(); + using slice_boolean = paddle::lite::kernels::arm::SliceCompute; REGISTER_LITE_KERNEL(slice, kARM, kFloat, kNCHW, slice_boolean, bool_slice) @@ -292,6 +321,21 @@ REGISTER_LITE_KERNEL(slice, kARM, kFloat, kNCHW, slice_boolean, bool_slice) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))}) .Finalize(); +REGISTER_LITE_KERNEL( + slice, kARM, kFloat, kNCHW, slice_boolean, array_bool_slice) + .BindInput("Input", + {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kBool))}) + .BindInput("StartsTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("EndsTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("EndsTensorList", + 
{LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))}) + .Finalize(); + using slice_int32 = paddle::lite::kernels::arm::SliceCompute; REGISTER_LITE_KERNEL(slice, kARM, kFloat, kNCHW, slice_int32, int32_slice) @@ -308,6 +352,20 @@ REGISTER_LITE_KERNEL(slice, kARM, kFloat, kNCHW, slice_int32, int32_slice) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .Finalize(); +REGISTER_LITE_KERNEL(slice, kARM, kFloat, kNCHW, slice_int32, array_int32_slice) + .BindInput("Input", + {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("StartsTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("EndsTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .Finalize(); + using slice_int64 = paddle::lite::kernels::arm::SliceCompute; @@ -324,3 +382,17 @@ REGISTER_LITE_KERNEL(slice, kARM, kFloat, kNCHW, slice_int64, def_int64) {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kInt64))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) .Finalize(); + +REGISTER_LITE_KERNEL(slice, kARM, kFloat, kNCHW, slice_int64, array_def_int64) + .BindInput("Input", + {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("StartsTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("EndsTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("StartsTensorList", + {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("EndsTensorList", + {LiteType::GetTensorListTy(TARGET(kARM), PRECISION(kInt64))}) + .BindOutput("Out", 
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .Finalize();