PaddlePaddle · zhupengyang · Oct 11, 2023 · Sep 8, 2023 · Sep 19, 2023
@@ -114,6 +114,7 @@ USE_MIR_PASS(__xpu__multi_softmax_fuse_pass);
 USE_MIR_PASS(__xpu__max_pooling_pad_zero_detect_fuse_pass);
 USE_MIR_PASS(__xpu__static_kernel_pick_pass);
 USE_MIR_PASS(__xpu__quantization_parameters_propagation_pass);
+USE_MIR_PASS(__xpu__greater_than_cast_mul_fuse_pass);  // same name as the REGISTER_MIR_PASS entry for XPUGreaterThanCastMulFusePass
 USE_MIR_PASS(x86_int8_attribute_pass);
 USE_MIR_PASS(fill_range_fuse_pass);
 USE_MIR_PASS(range_calc_offline_pass);

@@ -0,0 +1,155 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+#include <string>
+#include "lite/backends/xpu/math.h"
+#include "lite/core/optimizer/mir/pass_registry.h"
+#include "lite/core/optimizer/mir/pattern_matcher_high_api.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+namespace fusion {
+
+/*-----------------------------------------------------*/
+/* support xpu greater_than_cast_mul_fuse_pass         */
+/*                in_Input --------------              */
+/*                    |         |       |              */
+/*                    |         |       |              */
+/*              fill_any_like   |       |              */
+/*                    |         |       |              */
+/*                    |         |       |              */
+/*                  scale      /       /               */
+/*                    |       /       /                */
+/*                    |      /       /                 */
+/*              greater_than        /                  */
+/*                    |            /                   */
+/*                    |           /                    */
+/*                  cast         /                     */
+/*                    |         /                      */
+/*                    |        /                       */
+/*              elementwise_mul                        */
+/*                    |                                */
+/*                    |                                */
+/*                out_Output                           */
+/*-----------------------------------------------------*/
+
+/*-----------------------------------------------------*/
+/* After the pass is applied:                          */
+/*                in_Input                             */
+/*                    |                                */
+/*                    |                                */
+/*        __xpu__greater_than_filter                   */
+/*                    |                                */
+/*                    |                                */
+/*                out_Output                           */
+/*-----------------------------------------------------*/
+
+// Pattern fuser that replaces the chain
+//   fill_any_like -> scale -> greater_than -> cast -> elementwise_mul
+// (where one variable feeds fill_any_like, greater_than and elementwise_mul)
+// with a single __xpu__greater_than_filter op.
+class XPUGreaterThanCastMulFuser : public FuseBase {
+ public:
+  // Declares the subgraph to match. The string keys given to VarNode/OpNode
+  // are the names InsertNewNode uses to look the matched nodes up again.
+  void BuildPattern() override {
+    // The shared input: consumed as "X" by three different ops.
+    auto* input = VarNode("input")
+                      ->assert_is_op_input("fill_any_like", "X")
+                      ->assert_is_op_input("greater_than", "X")
+                      ->assert_is_op_input("elementwise_mul", "X")
+                      ->AsInput();
+    auto* fill_any_like =
+        OpNode("fill_any_like", "fill_any_like")->AsIntermediate();
+    auto* fill_any_like_out = VarNode("fill_any_like_out")
+                                  ->assert_is_op_output("fill_any_like", "Out")
+                                  ->assert_is_op_input("scale", "X")
+                                  ->AsIntermediate();
+    auto* scale = OpNode("scale", "scale")->AsIntermediate();
+    auto* scale_out = VarNode("scale_out")
+                          ->assert_is_op_output("scale", "Out")
+                          ->assert_is_op_input("greater_than", "Y")
+                          ->AsIntermediate();
+    auto* greater_than =
+        OpNode("greater_than", "greater_than")->AsIntermediate();
+    auto* greater_than_out = VarNode("greater_than_out")
+                                 ->assert_is_op_output("greater_than", "Out")
+                                 ->assert_is_op_input("cast", "X")
+                                 ->AsIntermediate();
+    auto* cast = OpNode("cast", "cast")->AsIntermediate();
+    auto* cast_out = VarNode("cast_out")
+                         ->assert_is_op_output("cast", "Out")
+                         ->assert_is_op_input("elementwise_mul", "Y")
+                         ->AsIntermediate();
+    auto* elementwise_mul =
+        OpNode("elementwise_mul", "elementwise_mul")->AsIntermediate();
+    auto* elementwise_mul_out =
+        VarNode("elementwise_mul_out")
+            ->assert_is_op_output("elementwise_mul", "Out")
+            ->AsOutput();
+
+    // Main chain, followed by the two extra edges from the shared input.
+    *input >> *fill_any_like >> *fill_any_like_out >> *scale >> *scale_out >>
+        *greater_than >> *greater_than_out >> *cast >> *cast_out >>
+        *elementwise_mul >> *elementwise_mul_out;
+    *input >> *greater_than;
+    *input >> *elementwise_mul;
+  }
+
+  // Builds the fused op for one match and links it between the matched input
+  // and the matched elementwise_mul output.
+  //
+  // graph:   graph that receives the new instruction node.
+  // matched: pattern-key -> node map for the current match.
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
+    cpp::OpDesc op_desc;
+    op_desc.SetType("__xpu__greater_than_filter");
+    op_desc.SetInput("X", {matched.at("input")->arg()->name});
+    op_desc.SetOutput("Out", {matched.at("elementwise_mul_out")->arg()->name});
+
+    const auto& fill_info = *matched.at("fill_any_like")->stmt()->op_info();
+    const auto& scale_info = *matched.at("scale")->stmt()->op_info();
+
+    // fill_any_like + scale produce a constant; fold it here so the fused op
+    // receives the exact value that greater_than compared the input against.
+    const float value_filled = fill_info.GetAttr<float>("value");
+    const float scale_val = scale_info.GetAttr<float>("scale");
+    // The scale op also carries a bias term. Ignoring it would shift the
+    // comparison constant whenever bias != 0, so it is folded in as well.
+    const float bias_val =
+        scale_info.HasAttr("bias") ? scale_info.GetAttr<float>("bias") : 0.f;
+    const bool bias_after_scale =
+        scale_info.HasAttr("bias_after_scale")
+            ? scale_info.GetAttr<bool>("bias_after_scale")
+            : true;
+    const float threshold = bias_after_scale
+                                ? value_filled * scale_val + bias_val
+                                : (value_filled + bias_val) * scale_val;
+    // The fused op reads the folded constant from its "scale" attribute.
+    op_desc.SetAttr<float>("scale", threshold);
+    // NOTE(review): the cast's output dtype and elementwise_mul's axis are
+    // not checked by the pattern -- confirm the fused kernel covers every
+    // combination this pattern can match.
+
+    // Reuse the scope and valid places of the matched fill_any_like op.
+    auto* scope = matched.at("fill_any_like")->stmt()->op()->scope();
+    auto& valid_places =
+        matched.at("fill_any_like")->stmt()->op()->valid_places();
+    auto new_op = LiteOpRegistry::Global().Create(op_desc.Type());
+    new_op->Attach(op_desc, scope);
+    auto* new_op_node = graph->GraphCreateInstructNode(new_op, valid_places);
+
+    DirectedLink(matched.at("input"), new_op_node);
+    DirectedLink(new_op_node, matched.at("elementwise_mul_out"));
+  }
+};
+
+}  // namespace fusion
+
+// Program pass that runs fusion::XPUGreaterThanCastMulFuser over the graph
+// it is given.
+class XPUGreaterThanCastMulFusePass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
+    SSAGraph* ssa_graph = graph.get();
+    fusion::XPUGreaterThanCastMulFuser greater_than_filter_fuser;
+    greater_than_filter_fuser(ssa_graph);
+  }
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(__xpu__greater_than_cast_mul_fuse_pass,  // name referenced by USE_MIR_PASS and the optimizer pass list
+                  paddle::lite::mir::XPUGreaterThanCastMulFusePass)
+    .BindTargets({TARGET(kXPU)})  // bound to the XPU target only
+    .BindKernel("__xpu__greater_than_filter");  // op type the fuser emits
@@ -211,6 +211,7 @@ std::unique_ptr<RuntimeProgram> RunDefaultOptimizer(
        "__xpu__mmdnn_fuse_pass",
        "__xpu__bigru_fuse_pass",
        "__xpu__roformer_relative_pos_fuse_pass",
+       "__xpu__greater_than_cast_mul_fuse_pass",
        "__xpu__multihead_self_attn_fuse_pass",
        "__xpu__multihead_cross_attn_fuse_pass",
        "__xpu__geglu_fuse_pass",

@@ -137,6 +137,7 @@ add_kernel(__xpu__squeeze_excitation_compute_xpu XPU extra SRCS __xpu__squeeze_e
 add_kernel(__xpu__bigru_compute_xpu XPU extra SRCS __xpu__bigru_compute.cc)
 add_kernel(__xpu__dynamic_lstm_compute_xpu XPU extra SRCS __xpu__dynamic_lstm_compute.cc)
 add_kernel(__xpu__multi_softmax_compute_xpu XPU extra SRCS __xpu__multi_softmax_compute.cc)
+add_kernel(__xpu__greater_than_filter_compute_xpu XPU extra SRCS __xpu__greater_than_filter_compute.cc)  # kernel for the fused __xpu__greater_than_filter op
 
 add_kernel(__xpu__gn_silu_compute_xpu XPU extra SRCS __xpu__gn_silu_compute.cc)
 add_kernel(__xpu__multihead_self_attn_compute_xpu XPU extra SRCS __xpu__multihead_self_attn_compute.cc)

@@ -0,0 +1,54 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/__xpu__greater_than_filter_compute.h"
+#include <vector>
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+// Runs the XDNN greater_filter_fusion primitive on the op's input tensor and
+// writes the result into the op's output tensor (allocated on XPU).
+void GreaterThanFilterCompute::Run() {
+  auto& op_param = this->template Param<param_t>();
+  auto& xpu_ctx = this->ctx_->template As<XPUContext>();
+
+  // Only rank-2 input is accepted. The leading dimension is what gets passed
+  // to XDNN as the length argument.
+  // NOTE(review): the second dimension is never forwarded -- confirm this
+  // matches what greater_filter_fusion expects.
+  const auto input_dim = op_param.input->dims();
+  CHECK_EQ(input_dim.size(), 2);
+  const int len = input_dim[0];
+
+  const float* in_ptr = op_param.input->data<float>();
+  float* out_ptr = op_param.output->mutable_data<float>(TARGET(kXPU));
+  const int r = xdnn::greater_filter_fusion<float>(
+      xpu_ctx.GetRawContext(), in_ptr, out_ptr, op_param.scale, len);
+  CHECK_EQ(r, 0) << "call GreaterThanFilterCompute failed";
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(__xpu__greater_than_filter,  // op type set by the fuse pass
+                     kXPU,    // matches KernelLite<TARGET(kXPU), ...>
+                     kFloat,  // matches PRECISION(kFloat)
+                     kNCHW,
+                     paddle::lite::kernels::xpu::GreaterThanFilterCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})  // slot read in AttachImpl
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})  // slot written in AttachImpl
+    .Finalize();
@@ -0,0 +1,37 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+// XPU float-precision kernel class whose parameters are an
+// operators::XPUGreaterThanFilterParam. Run() is only declared here; its
+// definition lives outside this header.
+class GreaterThanFilterCompute
+    : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::XPUGreaterThanFilterParam;
+
+  // `override` lets the compiler verify that this still replaces the
+  // base-class virtual instead of silently introducing a new function.
+  void Run() override;
+
+  virtual ~GreaterThanFilterCompute() = default;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
@@ -262,6 +262,7 @@ add_operator(__xpu__gn_silu_op extra SRCS __xpu__gn_silu_op.cc)
 add_operator(__xpu__multihead_self_attn_op extra SRCS __xpu__multihead_self_attn_op.cc)
 add_operator(__xpu__multihead_cross_attn_op extra SRCS __xpu__multihead_cross_attn_op.cc)
 add_operator(__xpu__geglu_op extra SRCS __xpu__geglu_op.cc)
+add_operator(__xpu__greater_than_filter_op extra SRCS __xpu__greater_than_filter_op.cc)  # operator definition for __xpu__greater_than_filter
 
 if(XPU_WITH_XFT)
     add_operator(fusion_decoding_op extra SRCS fusion_decoding_op.cc)

@@ -0,0 +1,64 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/__xpu__greater_than_filter_op.h"
+#include <vector>
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+// Validates the attached tensors: both must be set and the input must be
+// rank 2. Returns false (via the *_OR_FALSE macros) on the first violation.
+bool XPUGreaterThanFilterOp::CheckShape() const {
+  // Tensors are wired up by AttachImpl; bail out if either is missing.
+  CHECK_OR_FALSE(param_.input);
+  CHECK_OR_FALSE(param_.output);
+
+  // Same rank requirement the XPU kernel enforces in Run().
+  const auto& input_dims = param_.input->dims();
+  CHECK_EQ_OR_FALSE(input_dims.size(), 2UL);
+
+  return true;
+}
+
+// Gives the output tensor the same dims and LoD as the input tensor.
+// Always returns true.
+bool XPUGreaterThanFilterOp::InferShapeImpl() const {
+  const auto* in = param_.input;
+  auto* out = param_.output;
+
+  out->Resize(in->dims());
+  out->set_lod(in->lod());  // share LoD
+  return true;
+}
+
+// Binds the op's parameters from its description.
+//
+// op_desc: provides the "X" input name, the "Out" output name and the float
+//          "scale" attribute.
+// scope:   scope in which both variables must already exist.
+// Returns true; a missing variable or tensor aborts through CHECK.
+bool XPUGreaterThanFilterOp::AttachImpl(const cpp::OpDesc& op_desc,
+                                        lite::Scope* scope) {
+  CHECK(scope->FindVar(op_desc.Input("X").front()));
+  CHECK(scope->FindVar(op_desc.Output("Out").front()));
+
+  param_.input =
+      scope->FindVar(op_desc.Input("X").front())->GetMutable<Tensor>();
+  param_.output =
+      scope->FindVar(op_desc.Output("Out").front())->GetMutable<Tensor>();
+  param_.scale = op_desc.GetAttr<float>("scale");
+
+  CHECK(param_.input);
+  CHECK(param_.output);
+  // param_.scale is deliberately not CHECKed: it is a float value, not a
+  // pointer, and 0.0f is a valid setting that a truthiness check would
+  // reject.
+
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(__xpu__greater_than_filter,  // op type string the fuse pass passes to SetType
+                 paddle::lite::operators::XPUGreaterThanFilterOp);
@@ -0,0 +1,47 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include "lite/core/op_lite.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+// Operator class holding an XPUGreaterThanFilterParam and handing it to the
+// attached kernel. Shape checking, shape inference and attachment are
+// declared here and defined out of line.
+class XPUGreaterThanFilterOp : public OpLite {
+ public:
+  XPUGreaterThanFilterOp() = default;
+
+  explicit XPUGreaterThanFilterOp(const std::string &op_type)
+      : OpLite(op_type) {}
+
+  // Validation / shape inference / parameter binding.
+  bool CheckShape() const override;
+  bool InferShapeImpl() const override;
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  // Passes this op's parameter struct to the kernel.
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+
+  std::string DebugString() const override { return "XPUGreaterThanFilterOp"; }
+
+ private:
+  // mutable so the const shape-inference path can update the output tensor.
+  mutable XPUGreaterThanFilterParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle