Scs trans match v2.9 (#7368)
* [xpu] support qkv-fused weight reuse in scs_tran_match (#7293)

* [xpu] fix qkv-fused bias check

* [Cherry-pick][X86][ARM] scale support int, int64 (#6590)
newway authored Oct 22, 2021
1 parent 6c5af8a commit 92eb5d7
Showing 3 changed files with 71 additions and 69 deletions.
24 changes: 22 additions & 2 deletions lite/kernels/x86/scale_compute.cc
@@ -20,6 +20,26 @@ REGISTER_LITE_KERNEL(scale,
                      kNCHW,
                      paddle::lite::kernels::x86::ScaleCompute<float>,
                      def)
-    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFloat))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFloat))})
     .Finalize();
+
+REGISTER_LITE_KERNEL(scale,
+                     kX86,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::x86::ScaleCompute<int>,
+                     int32)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(scale,
+                     kX86,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::x86::ScaleCompute<int64_t>,
+                     int64)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))})
+    .Finalize();
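With these registrations, scale runs natively on int32 and int64 tensors on x86: the kernel stays under the kFloat kernel precision, but the int32/int64 aliases bind kInt32/kInt64 tensor types. For reference, a minimal standalone sketch of the elementwise math each variant performs (scale_ref is a hypothetical helper mirroring scale_compute<T> below, not the Paddle Lite API):

```cpp
#include <cstdint>
#include <vector>

// Sketch of the elementwise scale computation for an integral type:
// out[i] = x[i] * scale + bias; when bias is applied before scaling,
// it is folded into the bias term up front (bias *= scale).
template <typename T>
std::vector<T> scale_ref(const std::vector<T>& x, T scale, T bias,
                         bool bias_before) {
  if (bias_before) bias *= scale;
  std::vector<T> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) out[i] = x[i] * scale + bias;
  return out;
}

int main() {
  // With scale = 2 and bias = 1 (bias after scale): {1, 2, 3} -> {3, 5, 7}.
  auto out = scale_ref<int64_t>({1, 2, 3}, 2, 1, /*bias_before=*/false);
  return out == std::vector<int64_t>{3, 5, 7} ? 0 : 1;
}
```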
8 changes: 5 additions & 3 deletions lite/kernels/x86/scale_compute.h
@@ -27,7 +27,7 @@ namespace x86 {
 
 template <typename T>
 void scale_compute(
-    const T* x, T* out, int size, float scale, float bias, bool bias_before) {
+    const T* x, T* out, int size, T scale, T bias, bool bias_before) {
   if (bias_before) bias *= scale;
   for (int i = 0; i < size; i++) {
     out[i] = x[i] * scale + bias;
@@ -41,11 +41,13 @@ class ScaleCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
 
   void Run() override {
     auto& param = *param_.get_mutable<param_t>();
+    T scale = static_cast<T>(param.scale);
+    T bias = static_cast<T>(param.bias);
     scale_compute(param.x->template data<T>(),
                   param.output->template mutable_data<T>(),
                   param.x->dims().production(),
-                  param.scale,
-                  param.bias,
+                  scale,
+                  bias,
                   !param.bias_after_scale);
   }
 
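Note the casts in Run(): param.scale and param.bias remain floats in the op parameter, so for the integer kernels any fractional part is dropped, since float-to-integer conversion in C++ truncates toward zero. A minimal sketch of that effect:

```cpp
#include <cassert>

int main() {
  // The op parameter carries float scale/bias; the integer kernels cast them
  // to T first, so fractional values truncate toward zero.
  float scale = 2.7f, bias = -1.5f;
  assert(static_cast<int>(scale) == 2);   // not rounded to 3
  assert(static_cast<int>(bias) == -1);   // truncation toward zero
  return 0;
}
```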
108 changes: 44 additions & 64 deletions lite/tests/kernels/scale_compute_test.cc
@@ -21,6 +21,7 @@
 namespace paddle {
 namespace lite {
 
+template <typename T>
 class ScaleComputeTester : public arena::TestCase {
  protected:
   // common attributes for this op.
@@ -31,7 +32,6 @@ class ScaleComputeTester : public arena::TestCase {
   float bias_ = 0.;
   bool bias_after_scale_ = true;
   bool have_relu6 = false;
-  PrecisionType x_dtype_ = PRECISION(kFloat);
   std::string act_type_ = "relu6";
   float alpha_ = 6.0f;
 
@@ -42,18 +42,15 @@
                      float scale,
                      float bias,
                      bool bias_after_scale = true,
-                     bool have_relu6 = false,
-                     PrecisionType x_dtype = PRECISION(kFloat))
+                     bool have_relu6 = false)
       : TestCase(place, alias),
         x_dims_(x_dims),
         scale_(scale),
         bias_(bias),
         bias_after_scale_(bias_after_scale),
-        have_relu6(have_relu6),
-        x_dtype_(x_dtype) {}
+        have_relu6(have_relu6) {}
 
-  template <typename T>
-  void RunBaselineHelper(Scope* scope) {
+  void RunBaseline(Scope* scope) override {
     auto* x = scope->FindTensor(x_);
     auto* x_data = x->template data<T>();
     auto* out = scope->NewTensor(out_);
@@ -77,20 +74,6 @@
     }
   }
 
-  void RunBaseline(Scope* scope) override {
-    switch (x_dtype_) {
-      case PRECISION(kFloat):
-        RunBaselineHelper<float>(scope);
-        break;
-      case PRECISION(kInt32):
-        RunBaselineHelper<int>(scope);
-        break;
-      default:
-        LOG(FATAL) << "unsupported data type: " << PrecisionToStr(x_dtype_);
-        break;
-    }
-  }
-
   void PrepareOpDesc(cpp::OpDesc* op_desc) {
     op_desc->SetType("scale");
     op_desc->SetInput("X", {x_});
@@ -104,33 +87,27 @@
     }
   }
 
-  template <typename T>
-  void PrepareDataHelper() {
-    std::vector<T> dx(x_dims_.production());
-    fill_data_rand<T>(dx.data(), -10, 10, x_dims_.production());
-    SetCommonTensor(x_, x_dims_, dx.data());
-  }
-
   void PrepareData() override {
-    switch (x_dtype_) {
-      case PRECISION(kFloat):
-        PrepareDataHelper<float>();
-        break;
-      case PRECISION(kInt32):
-        PrepareDataHelper<int>();
-        break;
-      default:
-        LOG(FATAL) << "unsupported data type: " << PrecisionToStr(x_dtype_);
-        break;
-    }
+    std::vector<T> dx(x_dims_.production());
+    fill_data_rand<T>(dx.data(),
+                      static_cast<T>(-10),
+                      static_cast<T>(10),
+                      x_dims_.production());
+    SetCommonTensor(x_, x_dims_, dx.data());
   }
 };
 
 void TestScaleShape(Place place, float abs_error) {
   for (auto x_dims :
        std::vector<std::vector<int64_t>>{{5, 2, 3, 4}, {8, 3, 5}, {12, 3}}) {
-    std::unique_ptr<arena::TestCase> tester(
-        new ScaleComputeTester(place, "def", DDim(x_dims), 1.5f, 0.2f, true));
+    std::unique_ptr<arena::TestCase> tester(new ScaleComputeTester<float>(
+        place, "def", DDim(x_dims), 1.5f, 0.2f, true));
     arena::Arena arena(std::move(tester), place, abs_error);
     arena.TestPrecision();
   }
@@ -139,7 +116,7 @@ void TestScaleShape(Place place, float abs_error) {
 void TestScaleValue(Place place, float abs_error) {
   for (float scale : {0.123, 0., -1.2}) {
     for (float bias : {1., 0., -1.2331}) {
-      std::unique_ptr<arena::TestCase> tester(new ScaleComputeTester(
+      std::unique_ptr<arena::TestCase> tester(new ScaleComputeTester<float>(
           place, "def", DDim({5, 2, 3, 4}), scale, bias));
       arena::Arena arena(std::move(tester), place, abs_error);
       arena.TestPrecision();
@@ -149,27 +126,19 @@ void TestScaleValue(Place place, float abs_error) {
 
 void TestScaleOrder(Place place, float abs_error) {
   for (bool bias_after_scale : {true, false}) {
-    std::unique_ptr<arena::TestCase> tester(new ScaleComputeTester(
+    std::unique_ptr<arena::TestCase> tester(new ScaleComputeTester<float>(
         place, "def", DDim({2, 3, 4, 5}), 1.5f, 0.2f, bias_after_scale));
     arena::Arena arena(std::move(tester), place, abs_error);
     arena.TestPrecision();
   }
 }
 
-void TestScaleDtype(Place place, float abs_error) {
-  for (PrecisionType x_dtype : {PRECISION(kFloat), PRECISION(kInt32)}) {
-    if (x_dtype == PRECISION(kFloat)) {
-      place.precision = PRECISION(kFloat);
-    } else if (x_dtype == PRECISION(kInt32)) {
-      place.precision = PRECISION(kInt32);
-    } else {
-      LOG(FATAL) << "fatal";
-    }
-    std::unique_ptr<arena::TestCase> tester(new ScaleComputeTester(
-        place, "def", DDim({2, 3, 4, 5}), 2.f, 1.f, true, false, x_dtype));
-    arena::Arena arena(std::move(tester), place, abs_error);
-    arena.TestPrecision();
-  }
+template <typename T>
+void TestScaleDtype(Place place, float abs_error, std::string alias) {
+  std::unique_ptr<arena::TestCase> tester(new ScaleComputeTester<T>(
+      place, alias, DDim({2, 3, 4, 5}), 2.f, 1.f, true, false));
+  arena::Arena arena(std::move(tester), place, abs_error);
+  arena.TestPrecision();
 }
 
 void TestScaleRelu6(Place place, float abs_error) {
@@ -178,13 +147,13 @@
   for (bool bias_after_scale : {true, false}) {
     for (bool have_relu6 : {true, false}) {
       std::unique_ptr<arena::TestCase> tester(
-          new ScaleComputeTester(place,
-                                 "def",
-                                 DDim(x_dims),
-                                 1.5f,
-                                 0.2f,
-                                 bias_after_scale,
-                                 have_relu6));
+          new ScaleComputeTester<float>(place,
+                                        "def",
+                                        DDim(x_dims),
+                                        1.5f,
+                                        0.2f,
+                                        bias_after_scale,
+                                        have_relu6));
       arena::Arena arena(std::move(tester), place, abs_error);
       arena.TestPrecision();
     }
@@ -203,7 +172,7 @@ TEST(Scale, precision) {
   abs_error = 5e-2;  // Using fp16 in OPENCL
 #elif defined(LITE_WITH_ARM)
   place = TARGET(kARM);
-#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL)
+#elif defined(LITE_WITH_XPU)
   place = TARGET(kXPU);
   abs_error = 3e-4;  // Some operations use fp16 in XPU
 #elif defined(LITE_WITH_HUAWEI_ASCEND_NPU)
@@ -221,10 +190,6 @@
 #if defined(LITE_WITH_OPENCL)
   TestScaleRelu6(place, abs_error);
 #endif
-#if defined(LITE_WITH_ARM) && !defined(LITE_WITH_NPU) && \
-    !defined(LITE_WITH_OPENCL)
-  TestScaleDtype(place, abs_error);
-#endif
 }
 
 TEST(Scale, performance) {
@@ -237,7 +202,7 @@
   return;
 #endif
 
-  std::unique_ptr<arena::TestCase> tester(new ScaleComputeTester(
+  std::unique_ptr<arena::TestCase> tester(new ScaleComputeTester<float>(
       place, "def", DDim(std::vector<int64_t>{5, 2, 3, 4}), 1.2, 1.1, true));
 
   // To modify the arm context, one can retrieve the context as follows.
@@ -249,5 +214,20 @@
   arena.TestPerformance(100);
 }
 
+TEST(Scale, dtype) {
+  Place place;
+  float abs_error = 1e-4;
+#if defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#elif defined(LITE_WITH_X86)
+  place = TARGET(kX86);
+#else
+  return;
+#endif
+
+  TestScaleDtype<int>(place, abs_error, "int32");
+  TestScaleDtype<int64_t>(place, abs_error, "int64");
+}
+
 }  // namespace lite
 }  // namespace paddle
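As an aside on the relu6 path exercised by TestScaleRelu6 above: the fused activation clamps the scaled output to [0, alpha] with alpha = 6.0f. A sketch of that baseline math, assuming the standard relu6 definition (scale_relu6 is a hypothetical helper, not the test's exact code):

```cpp
#include <algorithm>

// scale followed by relu6: y = min(max(x * scale + bias, 0), alpha).
float scale_relu6(float x, float scale, float bias, float alpha = 6.0f) {
  return std::min(std::max(x * scale + bias, 0.0f), alpha);
}

int main() {
  // 5 * 1.5 + 0.2 = 7.7, clamped to 6 by relu6.
  return scale_relu6(5.0f, 1.5f, 0.2f) == 6.0f ? 0 : 1;
}
```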
