From 337718f2a64682bebb91a0ecdee6ab3c0c455b93 Mon Sep 17 00:00:00 2001 From: shanliang1992 Date: Fri, 8 Apr 2022 13:44:10 +0800 Subject: [PATCH 01/13] [XPU] xpu lstm bug (#8777) --- lite/kernels/xpu/rnn_compute.cc | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/lite/kernels/xpu/rnn_compute.cc b/lite/kernels/xpu/rnn_compute.cc index f6b36f99a6f..1c46c46b206 100644 --- a/lite/kernels/xpu/rnn_compute.cc +++ b/lite/kernels/xpu/rnn_compute.cc @@ -158,18 +158,20 @@ void RnnCompute::Run() { } float* output_ptr = output->mutable_data(TARGET(kXPU)); - Tensor internal_output_1_tensor, internal_output_2_tensor; + XPUScratchPadGuard internal_output_1_guard, internal_output_2_guard; float* internal_output_1_ptr = nullptr; float* internal_output_2_ptr = nullptr; if (num_layers >= 2) { - internal_output_1_tensor.Resize(output->dims()); + internal_output_1_guard = + TargetWrapperXPU::MallocScratchPad(output->numel() * sizeof(float)); internal_output_1_ptr = - internal_output_1_tensor.mutable_data(TARGET(kXPU)); + reinterpret_cast(internal_output_1_guard->addr_); } if (num_layers >= 3) { - internal_output_2_tensor.Resize(output->dims()); + internal_output_2_guard = + TargetWrapperXPU::MallocScratchPad(output->numel() * sizeof(float)); internal_output_2_ptr = - internal_output_2_tensor.mutable_data(TARGET(kXPU)); + reinterpret_cast(internal_output_2_guard->addr_); } // PreState and State const float* init_h_ptr = pre_state[0]->data(); @@ -203,11 +205,12 @@ void RnnCompute::Run() { } if (is_bidirec) { - std::vector output_vec(2); + std::vector output_vec(2); std::vector output_ptr_vec(2); for (int i = 0; i < 2; ++i) { - output_vec[i].Resize({seq_len, batch_size, hdim}); - output_ptr_vec[i] = output_vec[i].mutable_data(TARGET(kXPU)); + output_vec[i] = TargetWrapperXPU::MallocScratchPad( + seq_len * batch_size * hdim * sizeof(float)); + output_ptr_vec[i] = reinterpret_cast(output_vec[i]->addr_); } runLSTMLayer(ctx.GetRawContext(), @@ -246,16 +249,9 @@ void RnnCompute::Run() { i, 1); // concat - std::vector x_list; - std::vector> xdims_list; - for (int i = 0; i < 2; i++) { - auto& x = output_vec[i]; - x_list.push_back(output_ptr_vec[i]); - xdims_list.push_back(std::vector()); - for (int j = 0; j < static_cast(x.dims().size()); j++) { - xdims_list[i].push_back(x.dims()[j]); - } - } + std::vector x_list{output_ptr_vec[0], output_ptr_vec[1]}; + std::vector> xdims_list{{seq_len, batch_size, hdim}, + {seq_len, batch_size, hdim}}; int r = xdnn::concat( ctx.GetRawContext(), x_list, cur_output_ptr, xdims_list, 2); From d56160f33fe6a804febc3884c6d730787a2f771d Mon Sep 17 00:00:00 2001 From: quwei03 <32065370+xiuxin121@users.noreply.github.com> Date: Thu, 14 Apr 2022 14:48:09 +0800 Subject: [PATCH 02/13] [XPU] add pad2d edge mode (#8827) --- lite/kernels/xpu/pad2d_compute.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lite/kernels/xpu/pad2d_compute.cc b/lite/kernels/xpu/pad2d_compute.cc index 1e1417a0f81..60d56ac2b9d 100644 --- a/lite/kernels/xpu/pad2d_compute.cc +++ b/lite/kernels/xpu/pad2d_compute.cc @@ -47,7 +47,7 @@ void Pad2dCompute::Run() { } T* out_data = out->template mutable_data(TARGET(kXPU)); - if (mode == "reflect" || mode == "constant") { + if (mode == "reflect" || mode == "constant" || mode == "edge") { int r = xdnn::pad2d(ctx.GetRawContext(), in_data, out_data, From 7e20bbb1ba494f8176d9e890e1c3a6c5eb8c2204 Mon Sep 17 00:00:00 2001 From: newway <237745+newway@users.noreply.github.com> Date: Fri, 15 Apr 2022 16:54:49 +0800 Subject: [PATCH 03/13] [XPU] Fix wrong tensor in parent scope brought in 993dd (#8821) --- lite/core/optimizer/mir/fusion/__xpu__bigru_fuse_pass.cc | 3 +-- .../mir/fusion/__xpu__conv2d_affine_channel_fuse_pass.cc | 3 +-- lite/core/optimizer/mir/fusion/__xpu__conv2d_fuse_pass.cc | 3 +-- lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc | 3 +-- .../__xpu__multi_encoder_adaptive_seqlen_fuse_pass.cc | 4 ++-- .../optimizer/mir/fusion/__xpu__multi_softmax_fuse_pass.cc | 3 +-- .../mir/fusion/__xpu__squeeze_excitation_fuse_pass.cc | 6 ++---- 7 files changed, 9 insertions(+), 16 deletions(-) diff --git a/lite/core/optimizer/mir/fusion/__xpu__bigru_fuse_pass.cc b/lite/core/optimizer/mir/fusion/__xpu__bigru_fuse_pass.cc index f2223c996cd..4ea480984af 100644 --- a/lite/core/optimizer/mir/fusion/__xpu__bigru_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/__xpu__bigru_fuse_pass.cc @@ -135,8 +135,7 @@ class RefactorBackwardGRUv1 : public FuseBase { auto* seq_rev_in_node = graph->NewArgumentNode(seq_rev_in_name); seq_rev_in_node->arg()->type = LiteType::GetTensorTy( TARGET(kXPU), PRECISION(kFloat), DATALAYOUT(kNCHW)); - auto* seq_rev_in_tensor = - scope->MutableParent()->NewTensor(seq_rev_in_name); + auto* seq_rev_in_tensor = scope->NewTensor(seq_rev_in_name); seq_rev_in_tensor->set_precision(paddle::lite_api::PrecisionType::kFloat); seq_rev_in_tensor->set_persistable(true); diff --git a/lite/core/optimizer/mir/fusion/__xpu__conv2d_affine_channel_fuse_pass.cc b/lite/core/optimizer/mir/fusion/__xpu__conv2d_affine_channel_fuse_pass.cc index a320fbe5086..825c3474cd7 100644 --- a/lite/core/optimizer/mir/fusion/__xpu__conv2d_affine_channel_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/__xpu__conv2d_affine_channel_fuse_pass.cc @@ -273,8 +273,7 @@ class XPUConv2dAffineChannelFuser : public FuseBase { auto* max_output_node = graph->NewArgumentNode(max_output_name); max_output_node->arg()->type = LiteType::GetTensorTy( TARGET(kXPU), PRECISION(kFloat), DATALAYOUT(kNCHW)); - auto* max_output_tensor = - scope->MutableParent()->NewTensor(max_output_name); + auto* max_output_tensor = scope->NewTensor(max_output_name); max_output_tensor->set_precision(paddle::lite_api::PrecisionType::kFloat); max_output_tensor->set_persistable(true); op_desc.SetOutput("OutputMax", {max_output_name}); diff --git a/lite/core/optimizer/mir/fusion/__xpu__conv2d_fuse_pass.cc b/lite/core/optimizer/mir/fusion/__xpu__conv2d_fuse_pass.cc index 1e9490036a4..f3838957433 100644 --- a/lite/core/optimizer/mir/fusion/__xpu__conv2d_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/__xpu__conv2d_fuse_pass.cc @@ -486,8 +486,7 @@ class XPUConv2dFuser : public FuseBase { auto* max_output_node = graph->NewArgumentNode(max_output_name); max_output_node->arg()->type = LiteType::GetTensorTy( TARGET(kXPU), PRECISION(kFloat), DATALAYOUT(kNCHW)); - auto* max_output_tensor = - scope->MutableParent()->NewTensor(max_output_name); + auto* max_output_tensor = scope->NewTensor(max_output_name); max_output_tensor->set_precision(paddle::lite_api::PrecisionType::kFloat); max_output_tensor->set_persistable(true); op_desc.SetOutput("OutputMax", {max_output_name}); diff --git a/lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc b/lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc index eb31636e636..0e3f3b0335d 100644 --- a/lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/__xpu__fc_fuse_pass.cc @@ -147,8 +147,7 @@ class XPUFcFuser : public FuseBase { auto* max_output_node = graph->NewArgumentNode(max_output_name); max_output_node->arg()->type = LiteType::GetTensorTy( TARGET(kXPU), PRECISION(kFloat), DATALAYOUT(kNCHW)); - auto* max_output_tensor = - scope->MutableParent()->NewTensor(max_output_name); + auto* max_output_tensor = scope->NewTensor(max_output_name); max_output_tensor->set_precision(paddle::lite_api::PrecisionType::kFloat); max_output_tensor->set_persistable(true); op_desc.SetOutput("OutputMax", {max_output_name}); diff --git a/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_adaptive_seqlen_fuse_pass.cc b/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_adaptive_seqlen_fuse_pass.cc index 1350d344c38..c78096e9bb4 100644 --- a/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_adaptive_seqlen_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_adaptive_seqlen_fuse_pass.cc @@ -115,7 +115,7 @@ class XPUMultiEncoderAdaptiveSeqlenFuser : public FuseBase { graph->NewArgumentNode(embedding_seq_lod_name); embedding_seq_lod_node->arg()->type = LiteType::GetTensorTy( TARGET(kHost), PRECISION(kInt32), DATALAYOUT(kNCHW)); - scope->MutableParent()->NewTensor(embedding_seq_lod_name); + scope->NewTensor(embedding_seq_lod_name); // add new arg pad_seq_len std::string embedding_pad_seq_len_name = embedding_out_name + "_pad_seq_len"; @@ -123,7 +123,7 @@ class XPUMultiEncoderAdaptiveSeqlenFuser : public FuseBase { graph->NewArgumentNode(embedding_pad_seq_len_name); embedding_pad_seq_len_node->arg()->type = LiteType::GetTensorTy( TARGET(kHost), PRECISION(kInt32), DATALAYOUT(kNCHW)); - scope->MutableParent()->NewTensor(embedding_pad_seq_len_name); + scope->NewTensor(embedding_pad_seq_len_name); embedding_op_desc.SetOutput("SeqLod", {embedding_seq_lod_name}); embedding_op_desc.SetOutput("PadSeqLen", {embedding_pad_seq_len_name}); diff --git a/lite/core/optimizer/mir/fusion/__xpu__multi_softmax_fuse_pass.cc b/lite/core/optimizer/mir/fusion/__xpu__multi_softmax_fuse_pass.cc index 1c81c982dba..deb34c58247 100644 --- a/lite/core/optimizer/mir/fusion/__xpu__multi_softmax_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/__xpu__multi_softmax_fuse_pass.cc @@ -184,10 +184,9 @@ class XPUMultiSliceSoftmaxFuser { "__xpu__multi_softmax_concat_output_" + in_name; CHECK(graph->RetrieveArgument(concat_output_name) == nullptr); auto* concat_output_node = graph->NewArgumentNode(concat_output_name); - concat_output_node->arg()->is_weight = true; concat_output_node->arg()->type = LiteType::GetTensorTy( TARGET(kXPU), PRECISION(kFloat), DATALAYOUT(kNCHW)); - scope->MutableParent()->NewTensor(concat_output_name); + scope->NewTensor(concat_output_name); op_desc.SetOutput("ConcatOut", {concat_output_name}); auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); diff --git a/lite/core/optimizer/mir/fusion/__xpu__squeeze_excitation_fuse_pass.cc b/lite/core/optimizer/mir/fusion/__xpu__squeeze_excitation_fuse_pass.cc index 26159a6416c..13c9000df3d 100644 --- a/lite/core/optimizer/mir/fusion/__xpu__squeeze_excitation_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/__xpu__squeeze_excitation_fuse_pass.cc @@ -225,8 +225,7 @@ class XPUSqueezeExcitationFuser_DEPREC : public FuseBase { auto* max_output_node = graph->NewArgumentNode(max_output_name); max_output_node->arg()->type = LiteType::GetTensorTy( TARGET(kXPU), PRECISION(kFloat), DATALAYOUT(kNCHW)); - auto* max_output_tensor = - scope->MutableParent()->NewTensor(max_output_name); + auto* max_output_tensor = scope->NewTensor(max_output_name); max_output_tensor->set_precision(paddle::lite_api::PrecisionType::kFloat); max_output_tensor->set_persistable(true); op_desc.SetOutput("OutputMax", {max_output_name}); @@ -537,8 +536,7 @@ class XPUSqueezeExcitationFuser : public FuseBase { auto* max_output_node = graph->NewArgumentNode(max_output_name); max_output_node->arg()->type = LiteType::GetTensorTy( TARGET(kXPU), PRECISION(kFloat), DATALAYOUT(kNCHW)); - auto* max_output_tensor = - scope->MutableParent()->NewTensor(max_output_name); + auto* max_output_tensor = scope->NewTensor(max_output_name); max_output_tensor->set_precision(paddle::lite_api::PrecisionType::kFloat); max_output_tensor->set_persistable(true); op_desc.SetOutput("OutputMax", {max_output_name}); From 1c6a7166592235c0a40bedb2e077b6e1b4c9d822 Mon Sep 17 00:00:00 2001 From: shanliang1992 Date: Mon, 18 Apr 2022 19:57:04 +0800 Subject: [PATCH 04/13] [XPU] fix conv2d lstm maxptr size bug (#8839) --- lite/kernels/xpu/__xpu__conv2d_compute.cc | 3 +++ lite/kernels/xpu/__xpu__dynamic_lstm_compute.cc | 16 ++++++++++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/lite/kernels/xpu/__xpu__conv2d_compute.cc b/lite/kernels/xpu/__xpu__conv2d_compute.cc index d99818e02a1..5399fe5182b 100644 --- a/lite/kernels/xpu/__xpu__conv2d_compute.cc +++ b/lite/kernels/xpu/__xpu__conv2d_compute.cc @@ -53,6 +53,9 @@ bool QuantFilter(const float* filter_on_host, template void XPUConv2dCompute::PrepareForRun() { auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + int max_ptr_size = xdnn::get_max_ptr_size(ctx.GetRawContext()); + param.output_max->Resize({max_ptr_size}); auto filter_ptr = param.filter->template data(); auto filter_dims = param.filter->dims(); diff --git a/lite/kernels/xpu/__xpu__dynamic_lstm_compute.cc b/lite/kernels/xpu/__xpu__dynamic_lstm_compute.cc index b968034c09d..3c840a49679 100644 --- a/lite/kernels/xpu/__xpu__dynamic_lstm_compute.cc +++ b/lite/kernels/xpu/__xpu__dynamic_lstm_compute.cc @@ -26,6 +26,8 @@ namespace xpu { void XPUDynamicLstmCompute::PrepareForRun() { auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + int max_ptr_size = xdnn::get_max_ptr_size(ctx.GetRawContext()); // transpose from weight_0[xdim, 4 * hdim] to transpose_weight_0[4 * hdim, // xdim] @@ -139,22 +141,24 @@ void XPUDynamicLstmCompute::PrepareForRun() { auto weight_0_len = param.weight_0->numel(); float max_weight_0 = paddle::lite::xpu::math::FindMaxAbs(weight_0_ptr, weight_0_len); - std::vector max_weight_0_v(4, max_weight_0); - weight_0_max_ = TargetWrapperXPU::MallocScratchPad(4 * sizeof(float)); + std::vector max_weight_0_v(max_ptr_size, max_weight_0); + weight_0_max_ = + TargetWrapperXPU::MallocScratchPad(max_ptr_size * sizeof(float)); float* weight_0_max_addr = reinterpret_cast(weight_0_max_->addr_); XPU_CALL(xpu_memcpy(weight_0_max_addr, max_weight_0_v.data(), - 4 * sizeof(float), + max_ptr_size * sizeof(float), XPUMemcpyKind::XPU_HOST_TO_DEVICE)); float max_weight_1 = paddle::lite::xpu::math::FindMaxAbs( param.weight_1->template data(), param.weight_1->numel()); - std::vector max_weight_1_v(4, max_weight_1); - weight_1_max_ = TargetWrapperXPU::MallocScratchPad(4 * sizeof(float)); + std::vector max_weight_1_v(max_ptr_size, max_weight_1); + weight_1_max_ = + TargetWrapperXPU::MallocScratchPad(max_ptr_size * sizeof(float)); float* weight_1_max_addr = reinterpret_cast(weight_1_max_->addr_); XPU_CALL(xpu_memcpy(weight_1_max_addr, max_weight_1_v.data(), - 4 * sizeof(float), + max_ptr_size * sizeof(float), XPUMemcpyKind::XPU_HOST_TO_DEVICE)); } From f8798c66d75953e8aeccf0b897c5febd4634a5db Mon Sep 17 00:00:00 2001 From: shanliang1992 Date: Wed, 20 Apr 2022 21:27:46 +0800 Subject: [PATCH 05/13] [XPU] fix lstm bug in xpu (#8886) --- .../xpu/__xpu__dynamic_lstm_compute.cc | 162 ++++++++---------- 1 file changed, 67 insertions(+), 95 deletions(-) diff --git a/lite/kernels/xpu/__xpu__dynamic_lstm_compute.cc b/lite/kernels/xpu/__xpu__dynamic_lstm_compute.cc index 3c840a49679..ac796d66326 100644 --- a/lite/kernels/xpu/__xpu__dynamic_lstm_compute.cc +++ b/lite/kernels/xpu/__xpu__dynamic_lstm_compute.cc @@ -39,21 +39,21 @@ void XPUDynamicLstmCompute::PrepareForRun() { weight_0_dims[0], weight_0_dims[1]); - // change weight_0 from [w_ix, w_gx, w_fx, w_ox] to [w_ix, w_fx, w_gx, w_ox] + // change weight_0 from [w_gx, w_ix, w_fx, w_ox] to [w_ix, w_fx, w_gx, w_ox] transpose_weight_0_ = TargetWrapperXPU::MallocScratchPad(weight_0_size * sizeof(float)); float* transpose_weight_0_addr = reinterpret_cast(transpose_weight_0_->addr_); XPU_CALL(xpu_memcpy(transpose_weight_0_addr, - cpu_transpose_weight_0.data(), - weight_0_size / 4 * sizeof(float), - XPUMemcpyKind::XPU_HOST_TO_DEVICE)); - XPU_CALL(xpu_memcpy(transpose_weight_0_addr + weight_0_size / 2, cpu_transpose_weight_0.data() + weight_0_size / 4, weight_0_size / 4 * sizeof(float), XPUMemcpyKind::XPU_HOST_TO_DEVICE)); XPU_CALL(xpu_memcpy(transpose_weight_0_addr + weight_0_size / 4, cpu_transpose_weight_0.data() + weight_0_size / 2, + weight_0_size / 4 * sizeof(float), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(transpose_weight_0_addr + weight_0_size / 2, + cpu_transpose_weight_0.data(), weight_0_size / 2 * sizeof(float), XPUMemcpyKind::XPU_HOST_TO_DEVICE)); XPU_CALL(xpu_memcpy(transpose_weight_0_addr + weight_0_size / 4 * 3, @@ -70,21 +70,21 @@ void XPUDynamicLstmCompute::PrepareForRun() { cpu_transpose_weight_1.data(), weight_1_dims[0], weight_1_dims[1]); - // change weight_1 from [w_ih, w_gh, w_fh, w_oh] to [w_ih, w_fh, w_gh, w_oh] + // change weight_1 from [w_gh, w_ih, w_fh, w_oh] to [w_ih, w_fh, w_gh, w_oh] transpose_weight_1_ = TargetWrapperXPU::MallocScratchPad(weight_1_size * sizeof(float)); float* transpose_weight_1_addr = reinterpret_cast(transpose_weight_1_->addr_); XPU_CALL(xpu_memcpy(transpose_weight_1_addr, - cpu_transpose_weight_1.data(), - weight_1_size / 4 * sizeof(float), - XPUMemcpyKind::XPU_HOST_TO_DEVICE)); - XPU_CALL(xpu_memcpy(transpose_weight_1_addr + weight_1_size / 2, cpu_transpose_weight_1.data() + weight_1_size / 4, weight_1_size / 4 * sizeof(float), XPUMemcpyKind::XPU_HOST_TO_DEVICE)); XPU_CALL(xpu_memcpy(transpose_weight_1_addr + weight_1_size / 4, cpu_transpose_weight_1.data() + weight_1_size / 2, + weight_1_size / 4 * sizeof(float), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(transpose_weight_1_addr + weight_1_size / 2, + cpu_transpose_weight_1.data(), weight_1_size / 2 * sizeof(float), XPUMemcpyKind::XPU_HOST_TO_DEVICE)); XPU_CALL(xpu_memcpy(transpose_weight_1_addr + weight_1_size / 4 * 3, @@ -92,21 +92,21 @@ void XPUDynamicLstmCompute::PrepareForRun() { weight_1_size / 4 * sizeof(float), XPUMemcpyKind::XPU_HOST_TO_DEVICE)); - // change bias_0 from [b_ix, b_gx, b_fx, b_ox] to [b_ix, b_fx, b_gx, b_ox] + // change bias_0 from [b_gx, b_ix, b_fx, b_ox] to [b_ix, b_fx, b_gx, b_ox] const float* bias_0 = param.bias_0->template data(); int bias_0_size = param.bias_0->numel(); bias_0_ = TargetWrapperXPU::MallocScratchPad(bias_0_size * sizeof(float)); float* bias_0_addr = reinterpret_cast(bias_0_->addr_); XPU_CALL(xpu_memcpy(bias_0_addr, - bias_0, - bias_0_size / 4 * sizeof(float), - XPUMemcpyKind::XPU_HOST_TO_DEVICE)); - XPU_CALL(xpu_memcpy(bias_0_addr + bias_0_size / 2, bias_0 + bias_0_size / 4, bias_0_size / 4 * sizeof(float), XPUMemcpyKind::XPU_HOST_TO_DEVICE)); XPU_CALL(xpu_memcpy(bias_0_addr + bias_0_size / 4, bias_0 + bias_0_size / 2, + bias_0_size / 4 * sizeof(float), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(bias_0_addr + bias_0_size / 2, + bias_0, bias_0_size / 2 * sizeof(float), XPUMemcpyKind::XPU_HOST_TO_DEVICE)); XPU_CALL(xpu_memcpy(bias_0_addr + bias_0_size / 4 * 3, @@ -114,13 +114,13 @@ void XPUDynamicLstmCompute::PrepareForRun() { bias_0_size / 4 * sizeof(float), XPUMemcpyKind::XPU_HOST_TO_DEVICE)); - // change bias_1 from [b_ix, b_gx, b_fx, b_ox] to [b_ix, b_fx, b_gx, b_ox] + // change bias_1 from [b_gx, b_ix, b_fx, b_ox] to [b_ix, b_fx, b_gx, b_ox] const float* bias_1 = param.bias_1->template data(); int bias_1_size = param.bias_1->numel(); bias_1_ = TargetWrapperXPU::MallocScratchPad(bias_1_size * sizeof(float)); float* bias_1_addr = reinterpret_cast(bias_1_->addr_); XPU_CALL(xpu_memcpy(bias_1_addr, - bias_1, + bias_1 + bias_1_size / 4, bias_1_size / 4 * sizeof(float), XPUMemcpyKind::XPU_HOST_TO_DEVICE)); XPU_CALL(xpu_memcpy(bias_1_addr + bias_1_size / 4, @@ -128,7 +128,7 @@ void XPUDynamicLstmCompute::PrepareForRun() { bias_1_size / 4 * sizeof(float), XPUMemcpyKind::XPU_HOST_TO_DEVICE)); XPU_CALL(xpu_memcpy(bias_1_addr + bias_1_size / 2, - bias_1 + bias_1_size / 4, + bias_1, bias_1_size / 2 * sizeof(float), XPUMemcpyKind::XPU_HOST_TO_DEVICE)); XPU_CALL(xpu_memcpy(bias_1_addr + bias_1_size / 4 * 3, @@ -185,62 +185,38 @@ void XPUDynamicLstmCompute::Run() { float* in_batch_tensor_addr = reinterpret_cast(in_batch_tensor->addr_); - // prepare lod and reverse lod - auto xpu_lod = - TargetWrapperXPU::MallocScratchPad(int_lod.size() * sizeof(int)); - int* xpu_lod_addr = reinterpret_cast(xpu_lod->addr_); - XPU_CALL(xpu_memcpy(xpu_lod_addr, - int_lod.data(), - int_lod.size() * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE)); - std::vector reverse_int_lod(int_lod.size()); - auto xpu_reverse_int_lod = - TargetWrapperXPU::MallocScratchPad(reverse_int_lod.size() * sizeof(int)); - int* xpu_reverse_int_lod_addr = - reinterpret_cast(xpu_reverse_int_lod->addr_); - // reverse input if is_reverse = true if (is_reverse) { auto reverse_input = TargetWrapperXPU::MallocScratchPad( param.input->numel() * sizeof(float)); float* reverse_input_addr = reinterpret_cast(reverse_input->addr_); - int r = xdnn::sequence_reverse(ctx.GetRawContext(), - input_addr, - xpu_lod_addr, - reverse_input_addr, - batch_size, - xdim); + int r = xdnn::sequence_reverse( + ctx.GetRawContext(), + input_addr, + reverse_input_addr, + {int_lod.data(), static_cast(int_lod.size()), nullptr}, + xdim); CHECK_EQ(r, 0); - std::reverse(seq_len_tensor.begin(), seq_len_tensor.end()); - - // get reverse lod tensor - reverse_int_lod[0] = 0; - for (int i = 0; i < seq_len_tensor.size(); i++) { - reverse_int_lod[i + 1] = reverse_int_lod[i] + seq_len_tensor[i]; - } - XPU_CALL(xpu_memcpy(xpu_reverse_int_lod_addr, - reverse_int_lod.data(), - reverse_int_lod.size() * sizeof(int), - XPUMemcpyKind::XPU_HOST_TO_DEVICE)); - - r = xdnn::sequence_pad(ctx.GetRawContext(), - reverse_input_addr, - xpu_reverse_int_lod_addr, - in_batch_tensor_addr, - batch_size, - max_seq_len, - xdim, - 0); + r = xdnn::sequence_pad( + ctx.GetRawContext(), + reverse_input_addr, + in_batch_tensor_addr, + {int_lod.data(), static_cast(int_lod.size()), nullptr}, + batch_size, + max_seq_len, + xdim, + 0); CHECK_EQ(r, 0); } else { - int r = xdnn::sequence_pad(ctx.GetRawContext(), - input_addr, - xpu_lod_addr, - in_batch_tensor_addr, - batch_size, - max_seq_len, - xdim, - 0); + int r = xdnn::sequence_pad( + ctx.GetRawContext(), + input_addr, + in_batch_tensor_addr, + {int_lod.data(), static_cast(int_lod.size()), nullptr}, + batch_size, + max_seq_len, + xdim, + 0); CHECK_EQ(r, 0); } @@ -289,29 +265,28 @@ void XPUDynamicLstmCompute::Run() { const float* weight_0_maxptr = reinterpret_cast(weight_0_max_->addr_); const float* weight_1_maxptr = reinterpret_cast(weight_1_max_->addr_); - r = xdnn::lstm_train_for_old_paddle( - ctx.GetRawContext(), - transpose_in_addr, - h0, - c0, - transpose_weight_0_addr, - transpose_weight_1_addr, - bias_0_addr, - bias_1_addr, - transpose_out_addr, - last_h_addr, - last_c_addr, - batch_size, - xdim, - hdim, - max_seq_len, - seq_len_tensor, - nullptr, - nullptr, - weight_0_maxptr, - weight_1_maxptr, - i_f_g_o_addr, - c_addr); + r = xdnn::lstm_train(ctx.GetRawContext(), + transpose_in_addr, + h0, + c0, + transpose_weight_0_addr, + transpose_weight_1_addr, + bias_0_addr, + bias_1_addr, + transpose_out_addr, + last_h_addr, + last_c_addr, + batch_size, + xdim, + hdim, + max_seq_len, + seq_len_tensor, + nullptr, + nullptr, + weight_0_maxptr, + weight_1_maxptr, + i_f_g_o_addr, + c_addr); CHECK_EQ(r, 0); // transpose from transpose_out[seq_len, batch_size, hdim] to @@ -337,9 +312,7 @@ void XPUDynamicLstmCompute::Run() { ctx.GetRawContext(), out_batch_tensor_addr, reverse_output_addr, - {reverse_int_lod.data(), - static_cast(reverse_int_lod.size()), - xpu_reverse_int_lod_addr}, + {int_lod.data(), static_cast(int_lod.size()), nullptr}, max_seq_len, hdim); CHECK_EQ(r, 0); @@ -347,9 +320,8 @@ void XPUDynamicLstmCompute::Run() { r = xdnn::sequence_reverse( ctx.GetRawContext(), reverse_output_addr, - xpu_reverse_int_lod_addr, param.hidden->template mutable_data(TARGET(kXPU)), - batch_size, + {int_lod.data(), static_cast(int_lod.size()), nullptr}, hdim); CHECK_EQ(r, 0); } else { @@ -357,7 +329,7 @@ void XPUDynamicLstmCompute::Run() { ctx.GetRawContext(), out_batch_tensor_addr, param.hidden->template mutable_data(TARGET(kXPU)), - {int_lod.data(), static_cast(int_lod.size()), xpu_lod_addr}, + {int_lod.data(), static_cast(int_lod.size()), nullptr}, max_seq_len, hdim); CHECK_EQ(r, 0); From c26cea689b5ad94d6d3c23b13694af4c1f9d3d3e Mon Sep 17 00:00:00 2001 From: Liuyinfeng <30849840+gitliuyf@users.noreply.github.com> Date: Thu, 21 Apr 2022 22:01:33 +0800 Subject: [PATCH 06/13] [xpu]:support matmul_v2 to encoder adaptive seqlen (#8883) --- ...multi_encoder_adaptive_seqlen_fuse_pass.cc | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_adaptive_seqlen_fuse_pass.cc b/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_adaptive_seqlen_fuse_pass.cc index c78096e9bb4..f15667422bb 100644 --- a/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_adaptive_seqlen_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_adaptive_seqlen_fuse_pass.cc @@ -60,14 +60,18 @@ namespace fusion { class XPUMultiEncoderAdaptiveSeqlenFuser : public FuseBase { public: + explicit XPUMultiEncoderAdaptiveSeqlenFuser( + const std::string& matmul_type = "matmul") + : matmul_type_(matmul_type) {} + void BuildPattern() override { auto* mask = VarNode("mask") - ->assert_is_op_input("matmul", "X") - ->assert_is_op_input("matmul", "Y"); - auto* matmul = OpNode("matmul", "matmul")->AsIntermediate(); + ->assert_is_op_input(matmul_type_, "X") + ->assert_is_op_input(matmul_type_, "Y"); + auto* matmul = OpNode("matmul", matmul_type_)->AsIntermediate(); auto* matmul_out = VarNode("matmul_out") ->assert_is_op_input("scale", "X") - ->assert_is_op_output("matmul", "Out") + ->assert_is_op_output(matmul_type_, "Out") ->AsIntermediate(); auto* scale = OpNode("scale", "scale")->AsIntermediate(); auto* scale_out = VarNode("scale_out") @@ -140,6 +144,9 @@ class XPUMultiEncoderAdaptiveSeqlenFuser : public FuseBase { DirectedLink(embedding_seq_lod_node, matched.at("xpu_encoder")); DirectedLink(embedding_pad_seq_len_node, matched.at("xpu_encoder")); } + + private: + std::string matmul_type_; }; } // namespace fusion @@ -147,8 +154,11 @@ class XPUMultiEncoderAdaptiveSeqlenFuser : public FuseBase { class XPUMultiEncoderAdaptiveSeqlenFusePass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override { - fusion::XPUMultiEncoderAdaptiveSeqlenFuser fuser; - fuser(graph.get()); + std::vector matmul_types{"matmul", "matmul_v2"}; + for (auto& matmul_type : matmul_types) { + fusion::XPUMultiEncoderAdaptiveSeqlenFuser fuser(matmul_type); + fuser(graph.get()); + } } }; From a87d3429c6fa170aa3534eab353dda482771a69e Mon Sep 17 00:00:00 2001 From: quwei03 <32065370+xiuxin121@users.noreply.github.com> Date: Sun, 24 Apr 2022 15:18:24 +0800 Subject: [PATCH 07/13] [xpu] proposal compute support in kl2 (#8919) --- .../kernels/xpu/generate_proposals_compute.cc | 72 ++++++++++--------- 1 file changed, 40 insertions(+), 32 deletions(-) diff --git a/lite/kernels/xpu/generate_proposals_compute.cc b/lite/kernels/xpu/generate_proposals_compute.cc index 2e1a89e0b28..2fef3b78c5d 100644 --- a/lite/kernels/xpu/generate_proposals_compute.cc +++ b/lite/kernels/xpu/generate_proposals_compute.cc @@ -133,39 +133,48 @@ void GenerateProposalsCompute::Run() { float* props_after_filter = box_decoder_pros + K * 4; float* scores_after_filter = topk_scores + K; int* index_after_nms = remove_small_boxes_idx + K; - int* nms_n_keep = remove_small_boxes_n_keep + 1; - // TODO(weihaoji) : Change TOPK Impl to XPU Version + // TODO(quwei) : Change TOPK Impl to XPU Version(k1) // Since XPU Topk Only Support K <= 512, Select CPU Version Right Now - // r = xdnn::sorted_topk(ctx.GetRawContext(), - // trans_scores + batch_idx * M, - // topk_scores, - // topk_indices, 1, M, K); - std::vector tmp_scores_cpu(M, 0); - std::vector topk_indices_cpu(K, 0); - std::vector topk_scores_cpu(K, 0); - TargetWrapperXPU::MemcpySync(tmp_scores_cpu.data(), - trans_scores + batch_idx * M, - sizeof(float) * M, - IoDirection::DtoH); + if ((K <= 512 && ctx.GetRawContext()->dev().type() == xdnn::kXPU1) || + (K <= 6400 && ctx.GetRawContext()->dev().type() == xdnn::kXPU2)) { + r = xdnn::sorted_topk(ctx.GetRawContext(), + trans_scores + batch_idx * M, + topk_scores, + topk_indices, + 1, + M, + K, + true); + } else { + std::vector tmp_scores_cpu(M, 0); + std::vector topk_indices_cpu(K, 0); + std::vector topk_scores_cpu(K, 0); + + TargetWrapperXPU::MemcpySync(tmp_scores_cpu.data(), + trans_scores + batch_idx * M, + sizeof(float) * M, + IoDirection::DtoH); - xdnn::Context ctx_cpu(xdnn::kCPU); - r = xdnn::sorted_topk(&ctx_cpu, - tmp_scores_cpu.data(), + xdnn::Context ctx_cpu(xdnn::kCPU); + r = xdnn::sorted_topk(&ctx_cpu, + tmp_scores_cpu.data(), + topk_scores_cpu.data(), + topk_indices_cpu.data(), + 1, + M, + K, + true); + CHECK_EQ(r, 0); + XPU_CALL(xpu_memcpy(topk_scores, topk_scores_cpu.data(), + sizeof(float) * K, + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + XPU_CALL(xpu_memcpy(topk_indices, topk_indices_cpu.data(), - 1, - M, - K); - CHECK_EQ(r, 0); - XPU_CALL(xpu_memcpy(topk_scores, - topk_scores_cpu.data(), - sizeof(float) * K, - XPUMemcpyKind::XPU_HOST_TO_DEVICE)); - XPU_CALL(xpu_memcpy(topk_indices, - topk_indices_cpu.data(), - sizeof(float) * K, - XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + sizeof(float) * K, + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + } // gather r = xdnn::gather(ctx.GetRawContext(), @@ -242,16 +251,15 @@ void GenerateProposalsCompute::Run() { 0); CHECK_EQ(r, 0); // NMS + int nms_n_keep_cpu = -1; r = xdnn::sorted_nms(ctx.GetRawContext(), props_after_filter, index_after_nms, - nms_n_keep, + nms_n_keep_cpu, remove_small_boxes_n_keep_cpu, nms_thresh); CHECK_EQ(r, 0); - int nms_n_keep_cpu = 0; - TargetWrapperXPU::MemcpySync( - &nms_n_keep_cpu, nms_n_keep, sizeof(int), IoDirection::DtoH); + nms_n_keep_cpu = std::min(nms_n_keep_cpu, post_nms_top_n); // Gather After NMS r = xdnn::gather(ctx.GetRawContext(), From 93c69c69882925d214c4a749dd08b5a5fdbb50c7 Mon Sep 17 00:00:00 2001 From: Gradie <1099562076@qq.com> Date: Wed, 27 Apr 2022 21:38:27 +0800 Subject: [PATCH 08/13] [XPU] update lstm_inference api (#8951) --- .../xpu/__xpu__dynamic_lstm_compute.cc | 56 +++++++++---------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/lite/kernels/xpu/__xpu__dynamic_lstm_compute.cc b/lite/kernels/xpu/__xpu__dynamic_lstm_compute.cc index ac796d66326..56fed1a8f9b 100644 --- a/lite/kernels/xpu/__xpu__dynamic_lstm_compute.cc +++ b/lite/kernels/xpu/__xpu__dynamic_lstm_compute.cc @@ -172,9 +172,11 @@ void XPUDynamicLstmCompute::Run() { paddle::lite::LoD in_lods = param.input->lod(); std::vector int_lod(in_lods[0].begin(), in_lods[0].end()); std::vector seq_len_tensor(in_lods[0].size() - 1); + std::vector seq_len_tensor_64(in_lods[0].size() - 1); int max_seq_len = 0; for (int i = 0; i < int_lod.size() - 1; i++) { seq_len_tensor[i] = int_lod[i + 1] - int_lod[i]; + seq_len_tensor_64[i] = int_lod[i + 1] - int_lod[i]; max_seq_len = std::max(max_seq_len, seq_len_tensor[i]); } int batch_size = seq_len_tensor.size(); @@ -255,38 +257,36 @@ void XPUDynamicLstmCompute::Run() { auto last_c = TargetWrapperXPU::MallocScratchPad(batch_size * hdim * sizeof(float)); float* last_c_addr = reinterpret_cast(last_c->addr_); - auto i_f_g_o = TargetWrapperXPU::MallocScratchPad(max_seq_len * batch_size * - hdim * 4 * sizeof(float)); - float* i_f_g_o_addr = reinterpret_cast(i_f_g_o->addr_); - auto c = TargetWrapperXPU::MallocScratchPad(max_seq_len * batch_size * hdim * - sizeof(float)); - float* c_addr = reinterpret_cast(c->addr_); const float* weight_0_maxptr = reinterpret_cast(weight_0_max_->addr_); const float* weight_1_maxptr = reinterpret_cast(weight_1_max_->addr_); - r = xdnn::lstm_train(ctx.GetRawContext(), - transpose_in_addr, - h0, - c0, - transpose_weight_0_addr, - transpose_weight_1_addr, - bias_0_addr, - bias_1_addr, - transpose_out_addr, - last_h_addr, - last_c_addr, - batch_size, - xdim, - hdim, - max_seq_len, - seq_len_tensor, - nullptr, - nullptr, - weight_0_maxptr, - weight_1_maxptr, - i_f_g_o_addr, - c_addr); + auto x_seq_len_guard = + TargetWrapperXPU::MallocScratchPad(batch_size * sizeof(int64_t)); + int64_t* x_seq_len = reinterpret_cast(x_seq_len_guard->addr_); + XPU_CALL(xpu_memcpy(x_seq_len, + seq_len_tensor_64.data(), + batch_size * sizeof(int64_t), + XPUMemcpyKind::XPU_HOST_TO_DEVICE)); + r = xdnn::lstm_inference(ctx.GetRawContext(), + max_seq_len, + batch_size, + xdim, + hdim, + false, + transpose_in_addr, + h0, + c0, + x_seq_len, + transpose_weight_0_addr, + weight_0_maxptr, + transpose_weight_1_addr, + weight_1_maxptr, + bias_0_addr, + bias_1_addr, + transpose_out_addr, + last_h_addr, + last_c_addr); CHECK_EQ(r, 0); // transpose from transpose_out[seq_len, batch_size, hdim] to From c63dcf6eb1603de75adea3ee21610b020fcf52a9 Mon Sep 17 00:00:00 2001 From: zhupengyang Date: Mon, 9 May 2022 13:31:03 +0800 Subject: [PATCH 09/13] [XPU] support xdnn/xre url (#8989) --- cmake/backends/xpu.cmake | 19 ++++++++++++------- lite/tools/build_linux.sh | 20 +++++++++++++++++++- lite/tools/ci_test.sh | 18 ++++++++++++++++-- 3 files changed, 47 insertions(+), 10 deletions(-) diff --git a/cmake/backends/xpu.cmake b/cmake/backends/xpu.cmake index 795b034b674..3dd9acb7f3d 100644 --- a/cmake/backends/xpu.cmake +++ b/cmake/backends/xpu.cmake @@ -39,15 +39,21 @@ if (NOT XPU_SDK_ENV) endif () endif () -macro (prepare_xpu_sdk sdk) - set (xpu_${sdk}_url "${XPU_SDK_URL}/${sdk}-${XPU_SDK_ENV}.tar.gz") - message (STATUS "xpu_${sdk}_url: ${xpu_${sdk}_url}") +if (NOT XPU_XDNN_URL) + set (XPU_XDNN_URL "${XPU_SDK_URL}/xdnn-${XPU_SDK_ENV}.tar.gz") +endif () +message (STATUS "XPU_XDNN_URL: ${XPU_XDNN_URL}") +if (NOT XPU_XRE_URL) + set (XPU_XRE_URL "${XPU_SDK_URL}/xre-${XPU_SDK_ENV}.tar.gz") +endif () +message (STATUS "XPU_XRE_URL: ${XPU_XRE_URL}") +macro (prepare_xpu_sdk sdk sdk_url) ExternalProject_Add ( extern_xpu_${sdk} ${EXTERNAL_PROJECT_LOG_ARGS} DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate -c -q ${xpu_${sdk}_url} && tar xf ${sdk}-${XPU_SDK_ENV}.tar.gz + DOWNLOAD_COMMAND wget --no-check-certificate -c -q ${sdk_url} -O ${sdk}.tar.gz && tar xf ${sdk}.tar.gz CONFIGURE_COMMAND "" BUILD_COMMAND "" UPDATE_COMMAND "" @@ -56,7 +62,6 @@ macro (prepare_xpu_sdk sdk) set (xpu_${sdk}_root "${XPU_INSTALL_DIR}/xpu/${sdk}" CACHE PATH "xpu ${sdk} include directory" FORCE) set (xpu_${sdk}_include_dir "${xpu_${sdk}_root}/include" CACHE PATH "xpu ${sdk} include directory" FORCE) - include_directories (${xpu_${sdk}_include_dir}) foreach (lib ${ARGN}) @@ -68,8 +73,8 @@ macro (prepare_xpu_sdk sdk) endmacro () if (NOT XPU_SDK_ROOT) - prepare_xpu_sdk (xdnn xpuapi) - prepare_xpu_sdk (xre xpurt) + prepare_xpu_sdk (xdnn ${XPU_XDNN_URL} xpuapi) + prepare_xpu_sdk (xre ${XPU_XRE_URL} xpurt) set (xpu_builder_libs xpuapi CACHE INTERNAL "xpu builder libs") set (xpu_runtime_libs xpurt CACHE INTERNAL "xpu runtime libs") return () diff --git a/lite/tools/build_linux.sh b/lite/tools/build_linux.sh index f61f5420157..2fc2c3cb5a9 100755 --- a/lite/tools/build_linux.sh +++ b/lite/tools/build_linux.sh @@ -68,6 +68,8 @@ NNADAPTER_GOOGLE_XNNPACK_SRC_GIT_TAG="master" # options of compiling baidu XPU lib. WITH_KUNLUNXIN_XPU=OFF KUNLUNXIN_XPU_SDK_URL="" +KUNLUNXIN_XPU_XDNN_URL="" +KUNLUNXIN_XPU_XRE_URL="" KUNLUNXIN_XPU_SDK_ENV="" KUNLUNXIN_XPU_SDK_ROOT="" # options of compiling intel fpga. @@ -200,6 +202,8 @@ function init_cmake_mutable_options { -DRKNPU_DDK_ROOT=$ROCKCHIP_NPU_SDK_ROOT \ -DLITE_WITH_XPU=$WITH_KUNLUNXIN_XPU \ -DXPU_SDK_URL=$KUNLUNXIN_XPU_SDK_URL \ + -DXPU_XDNN_URL=$KUNLUNXIN_XPU_XDNN_URL \ + -DXPU_XRE_URL=$KUNLUNXIN_XPU_XRE_URL \ -DXPU_SDK_ENV=$KUNLUNXIN_XPU_SDK_ENV \ -DXPU_SDK_ROOT=$KUNLUNXIN_XPU_SDK_ROOT \ -DLITE_WITH_TRAIN=$WITH_TRAIN \ @@ -416,7 +420,13 @@ function print_usage { echo -e "| ./lite/tools/build_linux.sh --arch=armv8 --with_kunlunxin_xpu=ON |" echo -e "| --with_kunlunxin_xpu: (OFF|ON); controls whether to compile lib for kunlunxin_xpu, default is OFF. |" echo -e "| --kunlunxin_xpu_sdk_url: (kunlunxin_xpu sdk download url) optional, default is |" - echo -e "| 'https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev_paddle' |" + echo -e "| 'https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev_paddle'. |" + echo -e "| 'xdnn' and 'xre' will be download from kunlunxin_xpu_sdk_url, so you don't |" + echo -e "| need to specify 'kunlunxin_xpu_xdnn_url' or 'kunlunxin_xpu_xre_url' separately. |" + echo -e "| --kunlunxin_xpu_xdnn_url: (kunlunxin_xpu xdnn download url) optional, default is empty. |" + echo -e "| It has higher priority than 'kunlunxin_xpu_sdk_url' |" + echo -e "| --kunlunxin_xpu_xre_url: (kunlunxin_xpu xre download url) optional, default is empty. |" + echo -e "| It has higher priority than 'kunlunxin_xpu_sdk_url' |" echo -e "| --kunlunxin_xpu_sdk_env: (bdcentos_x86_64|centos7_x86_64|ubuntu_x86_64|kylin_aarch64) optional, |" echo -e "| default is bdcentos_x86_64(if x86) / kylin_aarch64(if arm) |" echo -e "| --kunlunxin_xpu_sdk_root: (path to kunlunxin_xpu DDK file) optional, default is None |" @@ -657,6 +667,14 @@ function main { KUNLUNXIN_XPU_SDK_URL="${i#*=}" shift ;; + --kunlunxin_xpu_xdnn_url=*) + KUNLUNXIN_XPU_XDNN_URL="${i#*=}" + shift + ;; + --kunlunxin_xpu_xre_url=*) + KUNLUNXIN_XPU_XRE_URL="${i#*=}" + shift + ;; --kunlunxin_xpu_sdk_env=*) KUNLUNXIN_XPU_SDK_ENV="${i#*=}" shift diff --git a/lite/tools/ci_test.sh b/lite/tools/ci_test.sh index bff26e44880..b5f8e9f6ccb 100755 --- a/lite/tools/ci_test.sh +++ b/lite/tools/ci_test.sh @@ -39,6 +39,8 @@ REMOTE_DEVICE_LIST="2GX0119401000796,0123456789ABCDEF" REMOTE_DEVICE_WORK_DIR="/data/local/tmp" # Xpu sdk option XPU_SDK_URL="" +XPU_XDNN_URL="" +XPU_XRE_URL="" XPU_SDK_ENV="" XPU_SDK_ROOT="" @@ -611,6 +613,8 @@ function baidu_xpu_build_and_test() { local unit_test_filter_type=$3 local sdk_url=$4 local sdk_env=$5 + local xdnn_url=$6 + local xre_url=$7 # Build all of unittests and model tests cur_dir=$(pwd) @@ -633,6 +637,8 @@ function baidu_xpu_build_and_test() { -DLITE_WITH_XPU=ON \ -DLITE_WITH_LTO=OFF \ -DXPU_SDK_URL=$sdk_url \ + -DXPU_XDNN_URL=$xdnn_url \ + -DXPU_XRE_URL=$xre_url \ -DXPU_SDK_ENV=$sdk_env \ -DXPU_SDK_ROOT=$XPU_SDK_ROOT @@ -711,6 +717,14 @@ function main() { XPU_SDK_ENV="${i#*=}" shift ;; + --xpu_xdnn_url=*) + XPU_XDNN_URL="${i#*=}" + shift + ;; + --xpu_xre_url=*) + XPU_XRE_URL="${i#*=}" + shift + ;; --xpu_sdk_root=*) XPU_SDK_ROOT="${i#*=}" shift @@ -736,11 +750,11 @@ function main() { shift ;; baidu_xpu_disable_xtcl_build_and_test) - baidu_xpu_build_and_test OFF $UNIT_TEST_CHECK_LIST $UNIT_TEST_FILTER_TYPE $XPU_SDK_URL $XPU_SDK_ENV + baidu_xpu_build_and_test OFF $UNIT_TEST_CHECK_LIST $UNIT_TEST_FILTER_TYPE $XPU_SDK_URL $XPU_SDK_ENV $XPU_XDNN_URL $XPU_XRE_URL shift ;; baidu_xpu_enable_xtcl_build_and_test) - baidu_xpu_build_and_test ON $UNIT_TEST_CHECK_LIST $UNIT_TEST_FILTER_TYPE $XPU_SDK_URL $XPU_SDK_ENV + baidu_xpu_build_and_test ON $UNIT_TEST_CHECK_LIST $UNIT_TEST_FILTER_TYPE $XPU_SDK_URL $XPU_SDK_ENV $XPU_XDNN_URL $XPU_XRE_URL shift ;; *) From 5abf5cc17d5e89e88cea6f6b07911ed4c9304416 Mon Sep 17 00:00:00 2001 From: shentanyue <34421038+shentanyue@users.noreply.github.com> Date: Wed, 18 May 2022 22:09:50 +0800 Subject: [PATCH 10/13] [XPU] Fix gather op index_type not compatible error (#9031) --- lite/kernels/xpu/gather_compute.cc | 99 +++++++++++++++++++++++++++--- lite/kernels/xpu/gather_compute.h | 2 + 2 files changed, 92 insertions(+), 9 deletions(-) diff --git a/lite/kernels/xpu/gather_compute.cc b/lite/kernels/xpu/gather_compute.cc index b7f6ec0da83..f3eafc878fb 100644 --- a/lite/kernels/xpu/gather_compute.cc +++ b/lite/kernels/xpu/gather_compute.cc @@ -46,16 +46,88 @@ void GatherCompute::Run() { axis += x_dims.size(); } - int r = xdnn::gather( - ctx.GetRawContext(), - x->template data(), - index->template data(), - out->template mutable_data(TARGET(kXPU)), - x_dims, - index->numel(), - axis); + if (param.X->precision() == PrecisionType::kInt64 && + param.Index->precision() == PrecisionType::kInt64) { + auto* index_int64 = param.Index->template data(); + int size = param.Index->dims().production(); + XPUScratchPadGuard index_xpu_guard_ = + TargetWrapperXPU::MallocScratchPad(size * sizeof(int)); + int* index_int32_device = reinterpret_cast(index_xpu_guard_->addr_); - CHECK_EQ(r, 0); + int r0 = xdnn::cast_v2( + ctx.GetRawContext(), index_int64, index_int32_device, index->numel()); + CHECK_EQ(r0, 0); + + int r1 = xdnn::gather( + ctx.GetRawContext(), + x->template data(), + index_int32_device, + out->template mutable_data(TARGET(kXPU)), + x_dims, + index->numel(), + axis); + CHECK_EQ(r1, 0); + } else if (param.X->precision() == PrecisionType::kInt64 && + param.Index->precision() == PrecisionType::kInt32) { + int r = xdnn::gather( + ctx.GetRawContext(), + x->template data(), + index->template data(), + out->template mutable_data(TARGET(kXPU)), + x_dims, + index->numel(), + axis); + CHECK_EQ(r, 0); + } else if (param.X->precision() == PrecisionType::kInt32 && + param.Index->precision() == PrecisionType::kInt32) { + int r = xdnn::gather( + ctx.GetRawContext(), + x->template data(), + index->template data(), + out->template mutable_data(TARGET(kXPU)), + x_dims, + index->numel(), + axis); + CHECK_EQ(r, 0); + } else if (param.X->precision() == PrecisionType::kInt32 && + param.Index->precision() == PrecisionType::kInt64) { + int r = xdnn::gather( + ctx.GetRawContext(), + x->template data(), + index->template data(), + out->template mutable_data(TARGET(kXPU)), + x_dims, + index->numel(), + axis); + CHECK_EQ(r, 0); + } else if (param.X->precision() == PrecisionType::kFloat && + param.Index->precision() == PrecisionType::kInt32) { + int r = xdnn::gather( + ctx.GetRawContext(), + x->template data(), + index->template data(), + out->template mutable_data(TARGET(kXPU)), + x_dims, + index->numel(), + axis); + CHECK_EQ(r, 0); + } else if (param.X->precision() == PrecisionType::kFloat && + param.Index->precision() == PrecisionType::kInt64) { + int r = xdnn::gather( + ctx.GetRawContext(), + x->template data(), + index->template data(), + out->template mutable_data(TARGET(kXPU)), + x_dims, + index->numel(), + axis); + CHECK_EQ(r, 0); + } else { + LOG(FATAL) << "Unsupported gather op with x dtype: " + << lite_api::PrecisionToStr(param.X->precision()) + << " and index dtype: " + << lite_api::PrecisionToStr(param.Index->precision()); + } } } // namespace xpu @@ -107,3 +179,12 @@ REGISTER_LITE_KERNEL( {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) .Finalize(); +REGISTER_LITE_KERNEL( + gather, kXPU, kFloat, kNCHW, GatherXPUInt64Int64, gather_i64_i64) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindInput("Index", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindInput("Axis", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/xpu/gather_compute.h b/lite/kernels/xpu/gather_compute.h index a557b0c1a3b..a78be677d09 100644 --- a/lite/kernels/xpu/gather_compute.h +++ b/lite/kernels/xpu/gather_compute.h @@ -46,3 +46,5 @@ typedef paddle::lite::kernels::xpu::GatherCompute GatherXPUFloatInt64; typedef paddle::lite::kernels::xpu::GatherCompute GatherXPUInt64Int32; +typedef paddle::lite::kernels::xpu::GatherCompute + GatherXPUInt64Int64; From 3de4c61c0f5ac498c39489ad76c66874fd1ca9ca Mon Sep 17 00:00:00 2001 From: Liuyinfeng <30849840+gitliuyf@users.noreply.github.com> Date: Mon, 30 May 2022 10:50:59 +0800 Subject: [PATCH 11/13] [xpu] fix slice dims resize (#9057) --- lite/kernels/xpu/slice_compute.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/lite/kernels/xpu/slice_compute.cc b/lite/kernels/xpu/slice_compute.cc index 9e131aa4b49..6353d6114e6 100644 --- a/lite/kernels/xpu/slice_compute.cc +++ b/lite/kernels/xpu/slice_compute.cc @@ -172,6 +172,7 @@ void SliceCompute::Run() { int end = ends[i] < 0 ? ends[i] + static_cast(in_dims[axis]) : ends[i]; x_dim_end_[axis] = (std::min)(end, static_cast(in_dims[axis])); } + out->Resize(out_dims); int r = xdnn::slice(ctx.GetRawContext(), /* context */ From abd793f364152cf0a28aab4101e3578dd2f4638f Mon Sep 17 00:00:00 2001 From: newway <237745+newway@users.noreply.github.com> Date: Tue, 31 May 2022 10:16:32 +0800 Subject: [PATCH 12/13] [XPU] Matmul quant (#9068) --- lite/backends/xpu/target_wrapper.h | 10 ++ .../fusion/__xpu__multi_encoder_fuse_pass.cc | 131 ++++++++++++------ .../xpu/__xpu__multi_encoder_compute.cc | 72 +++++++--- .../xpu/__xpu__multi_encoder_compute.h | 4 + 4 files changed, 158 insertions(+), 59 deletions(-) diff --git a/lite/backends/xpu/target_wrapper.h b/lite/backends/xpu/target_wrapper.h index d3b70463716..ba8086c0414 100644 --- a/lite/backends/xpu/target_wrapper.h +++ b/lite/backends/xpu/target_wrapper.h @@ -91,6 +91,16 @@ class TargetWrapper { CHECK(xpu_stream_.get()); } tls_raw_ctx_.get()->xpu_stream = xpu_stream_.get(); + if (tls_raw_ctx_.get()->dev().type() == xdnn::kXPU1) { + LOG(INFO) << "running in KunLun1"; + } else if (tls_raw_ctx_.get()->dev().type() == xdnn::kXPU2) { + LOG(INFO) << "running in KunLun2"; + } else if (tls_raw_ctx_.get()->dev().type() == xdnn::kXPU3) { + LOG(INFO) << "running in KunLun3"; + } else { + LOG(FATAL) << "running in unknown XPU device: " + << static_cast(tls_raw_ctx_.get()->dev().type()); + } LOG(INFO) << "thread 0x" << std::hex << std::this_thread::get_id() << " set context xpu stream: " << xpu_stream_.get(); if (l3_planner_ == nullptr) { diff --git a/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc b/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc index eadf879032a..0b2516a3bf4 100644 --- a/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc +++ b/lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc @@ -508,43 +508,15 @@ class XPUSingleEncoderFuser : public FuseBase { CHECK_EQ(q_mul_y_shape[0], qkv_mul_y_shape[1]); CHECK_EQ(q_mul_y_shape[1], qkv_mul_y_shape[0]); CHECK_GT(hidden_dim, 0) << "invalid hidden_dim: " << hidden_dim; + // mul input_max, output_max * 6 + matmul x_max,y_max,output_max * 2 + std::vector fc_input_max; + set_quant_info(matched, &fc_input_max); + // mul & matmul input/output max + op_desc.SetAttr>("fc_input_max", fc_input_max); + if (q_mul_op_info->HasAttr("enable_int8") && q_mul_op_info->GetAttr("enable_int8")) { op_desc.SetAttr("enable_int8", true); - op_desc.SetAttr>( - "X0_max", - { - 127 * - matched.at("q_mul") - ->stmt() - ->op_info() - ->GetAttr>("X0_scale")[0], - 127 * - matched.at("k_mul") - ->stmt() - ->op_info() - ->GetAttr>("X0_scale")[0], - 127 * - matched.at("v_mul") - ->stmt() - ->op_info() - ->GetAttr>("X0_scale")[0], - 127 * - matched.at("qkv_mul") - ->stmt() - ->op_info() - ->GetAttr>("X0_scale")[0], - 127 * - matched.at("qkv_mul_3") - ->stmt() - ->op_info() - ->GetAttr>("X0_scale")[0], - 127 * - matched.at("qkv_mul_4") - ->stmt() - ->op_info() - ->GetAttr>("X0_scale")[0], - }); op_desc.SetAttr>( "Y0_max", { @@ -579,7 +551,7 @@ class XPUSingleEncoderFuser : public FuseBase { ->op_info() ->GetAttr>("Y0_scale")[0], }); - VLOG(3) << "q/k/v 127*y0_scale: " + VLOG(3) << "q/k/v weight_max: " << 127 * matched.at("q_mul") ->stmt() @@ -672,6 +644,82 @@ class XPUSingleEncoderFuser : public FuseBase { std::string mul_type_; bool with_q_scale_; bool norm_before_; + // quant_info: mul input_max, output_max * 6 + matmul x_max:y_max, output_max + // * 2 + void set_quant_info(const key2nodes_t& matched, + std::vector* quant_info) { + const std::vector quant_mul_ops = { + "q_mul", "k_mul", "v_mul", "qkv_mul", "qkv_mul_3", "qkv_mul_4"}; + const std::vector mul_add_ops = { + "q_add", "k_add", "v_add", "qkv_add", "qkv_add_3", "qkv_add_4"}; + const std::vector matmul_ops = {"qk_matmul", "qkv_matmul"}; + + auto* q_mul_op_info = matched.at("q_mul")->stmt()->op_info(); + const bool mul_quant = q_mul_op_info->HasAttr("enable_int8") && + q_mul_op_info->GetAttr("enable_int8"); + auto* qk_matmul_op_info = matched.at("qk_matmul")->stmt()->op_info(); + const bool matmul_quant = qk_matmul_op_info->HasAttr("enable_int8") && + qk_matmul_op_info->GetAttr("enable_int8"); + if (!mul_quant && !matmul_quant) { + VLOG(3) << "no quantized op"; + return; + } else { + VLOG(3) << "mul quantized: " << mul_quant + << ", matmul quantized: " << matmul_quant; + } + for (int i = 0; mul_quant && (i < quant_mul_ops.size()); ++i) { + auto& quant_mul = quant_mul_ops[i]; + quant_info->push_back( + 127 * + matched.at(quant_mul)->stmt()->op_info()->GetAttr>( + "X0_scale")[0]); + // ew_add out_threshold for output quant + auto& quant_ew = mul_add_ops[i]; + quant_info->push_back( + matched.at(quant_ew)->stmt()->op_info()->GetAttr( + "out_threshold")); + VLOG(3) << quant_mul << " input_max: " << (*quant_info)[i * 2] + << ", output_max(ew_add): " << (*quant_info)[i * 2 + 1]; + } + CHECK_EQ(quant_info->size(), 12); + float max_qkv_input = std::max((*quant_info)[0], (*quant_info)[2]); + max_qkv_input = std::max(max_qkv_input, (*quant_info)[4]); + (*quant_info)[0] = max_qkv_input; + (*quant_info)[2] = max_qkv_input; + (*quant_info)[4] = max_qkv_input; + float max_qkv_output = std::max((*quant_info)[1], (*quant_info)[3]); + max_qkv_output = std::max(max_qkv_output, (*quant_info)[5]); + (*quant_info)[1] = max_qkv_output; + (*quant_info)[3] = max_qkv_output; + (*quant_info)[5] = max_qkv_output; + VLOG(3) << "max_qkv_input: " << max_qkv_input + << ", max_qkv_output: " << max_qkv_output; + + if (matmul_quant) { + auto* qkv_matmul_op_info = matched.at("qkv_matmul")->stmt()->op_info(); + CHECK(qkv_matmul_op_info->HasAttr("X0_scale") == true); + float softmax_out_threshold = matched.at("qk_softmax") + ->stmt() + ->op_info() + ->GetAttr("out_threshold"); + VLOG(3) << "qkv_matmul X max: " << softmax_out_threshold + << ", qkv_matmul Out max: " << (*quant_info)[6]; + CHECK_LT(std::abs(softmax_out_threshold - + qkv_matmul_op_info->GetAttr>( + "X0_scale")[0] * + 127), + 1e-5); + CHECK(qk_matmul_op_info->HasAttr("X0_scale") == true); + quant_info->push_back(max_qkv_output); + quant_info->push_back(max_qkv_output); + quant_info->push_back(softmax_out_threshold); + // qkv_matmul X max + quant_info->push_back(softmax_out_threshold); + quant_info->push_back(max_qkv_output); + quant_info->push_back((*quant_info)[6]); + CHECK_EQ(quant_info->size(), 18); + } + } }; class XPUMultiEncoderFuser { @@ -745,13 +793,13 @@ class XPUMultiEncoderFuser { Node* cur_encoder = all_encoders[i]; auto* op_info = cur_encoder->stmt()->op_info(); if (enable_int8) { - CHECK( - op_info->HasAttr("enable_int8") && op_info->HasAttr("Y0_max") && - op_info->HasAttr("X0_max") /* && op_info->HasAttr("Out0_max")*/); + CHECK(op_info->HasAttr("enable_int8")) << "no enable_int8 attr"; + CHECK(op_info->HasAttr("Y0_max")) << "no Y0_max attr"; + CHECK(op_info->HasAttr("fc_input_max")) << "no fc_input_max attr"; for (auto y0 : op_info->GetAttr>("Y0_max")) { fc_weight_max.push_back(y0); } - for (auto x0 : op_info->GetAttr>("X0_max")) { + for (auto x0 : op_info->GetAttr>("fc_input_max")) { fc_input_max.push_back(x0); } } @@ -802,8 +850,11 @@ class XPUMultiEncoderFuser { op_desc.SetAttr("enable_int8", enable_int8); if (enable_int8) { CHECK_EQ(fc_precision_, "int8"); - CHECK_EQ(fc_input_max.size(), all_encoders.size() * 6); CHECK_EQ(fc_weight_max.size(), all_encoders.size() * 6); + CHECK((fc_input_max.size() == all_encoders.size() * 12) || + (fc_input_max.size() == all_encoders.size() * 18)) + << fc_input_max.size() + << ", all_encoders.size:" << all_encoders.size(); for (int i = 0; i < fc_weight_max.size(); i += 6) { CHECK_LT(std::abs(fc_weight_max[i] - fc_weight_max[i + 1]), 1e-5) << " quanted ernie's q/k weight scale should be euqal: " diff --git a/lite/kernels/xpu/__xpu__multi_encoder_compute.cc b/lite/kernels/xpu/__xpu__multi_encoder_compute.cc index dbe1601bfe5..8456babbdaf 100644 --- a/lite/kernels/xpu/__xpu__multi_encoder_compute.cc +++ b/lite/kernels/xpu/__xpu__multi_encoder_compute.cc @@ -50,6 +50,52 @@ std::vector* XPUMultiEncoderCompute::get_weight() { return &arg_fc_weight_fp32_; } +void XPUMultiEncoderCompute::prepare_quant_max( + const std::vector& max_value, + int n_layers, + int max_ptr_len, + std::vector& max_xpu_ptrs) { + bool mul_quant = false; + bool matmul_quant = false; + if (max_value.size() == (n_layers * 12)) { + mul_quant = true; + } else if (max_value.size() == (n_layers * 18)) { + mul_quant = true; + matmul_quant = true; + } else if (max_value.size() == 0) { + // dynamic quant, find max in xpu + return; + } else { + LOG(FATAL) << "invalid quant max value for xpu encoder, " + << max_value.size() << ", n_layers: " << n_layers; + } + // prepare input_max + input_max_guard_ = TargetWrapperXPU::MallocScratchPad( + max_value.size() * max_ptr_len * sizeof(float)); + float* input_max_ptr = reinterpret_cast(input_max_guard_->addr_); + std::vector cpu_max; + cpu_max.resize(max_value.size() * max_ptr_len); + for (int i = 0; i < max_value.size(); ++i) { + for (int j = 0; j < max_ptr_len; ++j) { + cpu_max[i * max_ptr_len + j] = max_value[i]; + } + } + lite::TargetWrapperXPU::MemcpySync( + input_max_ptr, + cpu_max.data(), + sizeof(float) * max_ptr_len * max_value.size(), + IoDirection::HtoD); + for (int i = 0; i < max_ptr_len * max_value.size(); i += max_ptr_len) { + max_xpu_ptrs.push_back(input_max_ptr + i); + } + if (matmul_quant) { + CHECK_EQ(max_xpu_ptrs.size(), (n_layers * 18)); + } else { + CHECK_EQ(max_xpu_ptrs.size(), (n_layers * 12)); + } + return; +} + void XPUMultiEncoderCompute::PrepareForRun() { auto& ctx = this->ctx_->As(); auto& param = this->Param(); @@ -88,24 +134,10 @@ void XPUMultiEncoderCompute::PrepareForRun() { IoDirection::HtoD); fc_weight_max_.push_back(cur_weight_max_ptr); } - if (param.input_max.size()) { - // prepare input_max - input_max_guard_ = TargetWrapperXPU::MallocScratchPad( - param.input_max.size() * XPU_QUANT_SCALE_NUM * sizeof(float)); - float* input_max_ptr = reinterpret_cast(input_max_guard_->addr_); - for (int i = 0; i < param.input_max.size(); i++) { - float* cur_input_max_ptr = input_max_ptr + i * XPU_QUANT_SCALE_NUM; - std::vector cpu_max(XPU_QUANT_SCALE_NUM, param.input_max[i]); - lite::TargetWrapperXPU::MemcpySync(cur_input_max_ptr, - cpu_max.data(), - sizeof(float) * XPU_QUANT_SCALE_NUM, - IoDirection::HtoD); - fc_input_max_.push_back(cur_input_max_ptr); - } - CHECK_EQ(fc_input_max_.size(), fc_weight_max_.size()) - << "input and weight max shape unequal:" << fc_input_max_.size() << "," - << fc_weight_max_.size(); - } + // prepare quant max, mul&matmul input/output max + const int n_layers = param.fc_weight.size() / 6; + prepare_quant_max( + param.input_max, n_layers, XPU_QUANT_SCALE_NUM, fc_input_max_); // prepare act_type if (param.act_type == "gelu") { qkv_act = xdnn::Activation_t::GELU; @@ -143,7 +175,9 @@ void XPUMultiEncoderCompute::run_encoder(const T* in, T* out) { true /* qkv fusion */, max_pad_seqlen, param.hidden_dim); - + if (std::is_same::value) { + CHECK_GT(fc_input_max_.size(), 0); + } int r = xdnn::transformer_encoder( ctx.GetRawContext(), in, diff --git a/lite/kernels/xpu/__xpu__multi_encoder_compute.h b/lite/kernels/xpu/__xpu__multi_encoder_compute.h index 7ab41dde856..a32a413ff74 100644 --- a/lite/kernels/xpu/__xpu__multi_encoder_compute.h +++ b/lite/kernels/xpu/__xpu__multi_encoder_compute.h @@ -52,6 +52,10 @@ class XPUMultiEncoderCompute template std::vector *get_weight(); + void prepare_quant_max(const std::vector &max_value, + int n_layers, + int max_ptr_len, + std::vector &max_xpu_ptrs); template void run_encoder(const T *in, T *out); }; From f39b3e71ba35b4c5dc2e5877cb848235ddb35c9c Mon Sep 17 00:00:00 2001 From: wangleilei001 Date: Tue, 17 May 2022 16:13:26 +0800 Subject: [PATCH 13/13] [XPU] update gemm/search api (#9023) --- lite/kernels/xpu/__xpu__conv2d_compute.cc | 2 +- lite/kernels/xpu/__xpu__dynamic_lstm_compute.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lite/kernels/xpu/__xpu__conv2d_compute.cc b/lite/kernels/xpu/__xpu__conv2d_compute.cc index 5399fe5182b..cad9a4fd691 100644 --- a/lite/kernels/xpu/__xpu__conv2d_compute.cc +++ b/lite/kernels/xpu/__xpu__conv2d_compute.cc @@ -54,7 +54,7 @@ template void XPUConv2dCompute::PrepareForRun() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); - int max_ptr_size = xdnn::get_max_ptr_size(ctx.GetRawContext()); + int max_ptr_size = ctx.GetRawContext()->max_ptr_size(); param.output_max->Resize({max_ptr_size}); auto filter_ptr = param.filter->template data(); auto filter_dims = param.filter->dims(); diff --git a/lite/kernels/xpu/__xpu__dynamic_lstm_compute.cc b/lite/kernels/xpu/__xpu__dynamic_lstm_compute.cc index 56fed1a8f9b..17864b7c069 100644 --- a/lite/kernels/xpu/__xpu__dynamic_lstm_compute.cc +++ b/lite/kernels/xpu/__xpu__dynamic_lstm_compute.cc @@ -27,7 +27,7 @@ namespace xpu { void XPUDynamicLstmCompute::PrepareForRun() { auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); - int max_ptr_size = xdnn::get_max_ptr_size(ctx.GetRawContext()); + int max_ptr_size = ctx.GetRawContext()->max_ptr_size(); // transpose from weight_0[xdim, 4 * hdim] to transpose_weight_0[4 * hdim, // xdim]