[LITE][NPU] Add layer_norm op bridge (#2767)
hong19860320 authored Jan 15, 2020
1 parent c0af965 commit 2ac5fe3
Showing 7 changed files with 183 additions and 10 deletions.
2 changes: 2 additions & 0 deletions lite/kernels/npu/bridges/CMakeLists.txt
@@ -42,6 +42,7 @@ lite_cc_library(subgraph_bridge_unsqueeze_op_npu SRCS unsqueeze_op.cc DEPS ${npu
lite_cc_library(subgraph_bridge_argmax_op_npu SRCS argmax_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_instance_norm_op_npu SRCS instance_norm_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_dropout_op_npu SRCS dropout_op.cc DEPS ${npu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_layer_norm_op_npu SRCS layer_norm_op.cc DEPS ${npu_subgraph_bridge_deps})

set(npu_subgraph_bridges
subgraph_bridge_registry
@@ -71,6 +72,7 @@ set(npu_subgraph_bridges
subgraph_bridge_argmax_op_npu
subgraph_bridge_instance_norm_op_npu
subgraph_bridge_dropout_op_npu
+subgraph_bridge_layer_norm_op_npu
CACHE INTERNAL "npu_subgraph_bridges")

message(STATUS "+++++ npu_subgraph_bridges: ${npu_subgraph_bridges}")
7 changes: 3 additions & 4 deletions lite/kernels/npu/bridges/instance_norm_op.cc
@@ -82,7 +82,7 @@ int InstanceNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  } else {
    if (!bias->persistable()) {
      LOG(WARNING) << "[NPU] Only supporting persistable bias tensor.";
-      bias->set_persistable(true);
+      return FAILED;
    }
    bias_node = graph->Add(bias_name, *bias, scale_bias_dims);
  }
@@ -108,7 +108,7 @@ int InstanceNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
    CHECK_EQ(channel_size, scale_dims.production());
    if (!scale->persistable()) {
      LOG(WARNING) << "[NPU] Only supporting persistable scale tensor.";
-      scale->set_persistable(true);
+      return FAILED;
    }
    scale_node = graph->Add(scale_name, *scale, scale_bias_dims);
  } else {
@@ -121,8 +121,7 @@ int InstanceNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  instance_norm_op->set_input_x(*x_node->data());
  instance_norm_op->set_input_scale(*scale_node->data());
  instance_norm_op->set_input_bias(*bias_node->data());
-  instance_norm_op->set_attr_reduction_indices(
-      ge::AttrValue::LIST_INT({0, 1, 2}));
+  instance_norm_op->set_attr_reduction_indices(ge::AttrValue::LIST_INT({2}));
  instance_norm_op->set_attr_epsilon(epsilon);
  return SUCCESS;
}
168 changes: 168 additions & 0 deletions lite/kernels/npu/bridges/layer_norm_op.cc
@@ -0,0 +1,168 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

int LayerNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " + op_type + "...";

  // Get input and output vars and op attributes
  auto x_name = op_info->Input("X").front();
  auto x_type = kernel->GetInputDeclType("X");
  CHECK(x_type->precision() == PRECISION(kFloat));
  CHECK(x_type->layout() == DATALAYOUT(kNCHW));
  auto x = scope->FindMutableTensor(x_name);
  auto x_dims = x->dims();
  auto padded_x_shape = CvtShape(x_dims);
  auto x_rank = static_cast<int>(x_dims.size());
  CHECK(x_rank >= 2 && x_rank <= 4);

  auto y_name = op_info->Output("Y").front();
  auto y_type = kernel->GetOutputDeclType("Y");
  CHECK(y_type->precision() == PRECISION(kFloat));
  CHECK(y_type->layout() == DATALAYOUT(kNCHW));
  auto y = scope->FindMutableTensor(y_name);
  auto y_dims = y->dims();
  auto padded_y_shape = CvtShape(y_dims);

  auto epsilon = op_info->GetAttr<float>("epsilon");
  auto begin_norm_axis = op_info->GetAttr<int>("begin_norm_axis");
  if (begin_norm_axis < 0) {
    begin_norm_axis += x_rank;
  }
  CHECK(begin_norm_axis >= 1 && begin_norm_axis < x_rank);
  auto x_mat_dims = x_dims.Flatten2D(begin_norm_axis);
  auto left = x_mat_dims[0];
  auto right = x_mat_dims[1];

  // X node
  std::shared_ptr<Node> x_node = nullptr;
  if (graph->Has(x_name)) {
    x_node = graph->Get(x_name);
  } else {
    x_node = graph->Add(x_name, *x, padded_x_shape);
  }

  // Reshaped X node if needed
  bool reshape = false;
  if (!(x_rank == 4 && begin_norm_axis == 1)) {
    reshape = true;
    // Only a 4-D input shape (n, c, h, w) with axis = 1 is supported by the
    // HiAI DDK, so the input shape needs to be padded to 4-D if its rank is
    // less than 4 or axis != 1. For example:
    // (1) (n, c, h, w), axis=1 -> no need
    // (2) (n, c, h, w), axis=2 -> (n * c, h, w, 1)
    // (3) (n, c, h, w), axis=3 -> (n * c * h, w, 1, 1)
    // (4) (n, h, w), axis=1 -> (n, h, w, 1)
    // (5) (n, h, w), axis=2 -> (n * h, w, 1, 1)
    // (6) (h, w), axis=1 -> (h, w, 1, 1)
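    // A concrete instance of case (2): x_dims = (2, 3, 4, 5) with axis = 2
    // gives left = 2 * 3 = 6 and right = 4 * 5 = 20, so the padded shape is
    // (6, 4, 5, 1).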
    padded_x_shape = {left};
    for (int i = begin_norm_axis; i < x_rank; i++) {
      padded_x_shape.push_back(x_dims[i]);
    }
    auto remain = 4 - padded_x_shape.size();
    for (int i = 0; i < remain; i++) {
      padded_x_shape.push_back(1);
    }
    auto reshaped_x_node = graph->Add<ge::op::Reshape>(
        x_name + "/reshape", x_node->precision(), x_node->layout());
    auto reshaped_x_op = reshaped_x_node->data<ge::op::Reshape>();
    reshaped_x_op->set_input_tensor(*x_node->data());
    reshaped_x_op->set_attr_shape(padded_x_shape);
    x_node = reshaped_x_node;
  }

  // Bias node
  auto scale_bias_dims =
      DDim({1, padded_x_shape[1], padded_x_shape[2], padded_x_shape[3]});
  std::shared_ptr<Node> bias_node = nullptr;
  if (HasInputArg(op_info, scope, "Bias")) {
    auto bias_name = op_info->Input("Bias").front();
    auto bias_type = kernel->GetInputDeclType("Bias");
    CHECK(bias_type->precision() == PRECISION(kFloat));
    CHECK(bias_type->layout() == DATALAYOUT(kNCHW));
    auto bias = scope->FindMutableTensor(bias_name);
    auto bias_dims = bias->dims();
    CHECK_EQ(bias_dims.size(), 1);
    CHECK_EQ(bias_dims.production(), right);
    if (!bias->persistable()) {
      LOG(WARNING) << "[NPU] Only supporting persistable bias tensor.";
      return FAILED;
    }
    bias_node = graph->Add(bias_name, *bias, scale_bias_dims);
  } else {
    bias_node = graph->Add(y_name + "/bias", 0.0f, scale_bias_dims);
  }

  // Scale node
  std::shared_ptr<Node> scale_node = nullptr;
  if (HasInputArg(op_info, scope, "Scale")) {
    auto scale_name = op_info->Input("Scale").front();
    auto scale_type = kernel->GetInputDeclType("Scale");
    CHECK(scale_type->precision() == PRECISION(kFloat));
    CHECK(scale_type->layout() == DATALAYOUT(kNCHW));
    auto scale = scope->FindMutableTensor(scale_name);
    auto scale_dims = scale->dims();
    CHECK_EQ(scale_dims.size(), 1);
    CHECK_EQ(scale_dims.production(), right);
    if (!scale->persistable()) {
      LOG(WARNING) << "[NPU] Only supporting persistable scale tensor.";
      return FAILED;
    }
    scale_node = graph->Add(scale_name, *scale, scale_bias_dims);
  } else {
    scale_node = graph->Add(y_name + "/scale", 1.0f, scale_bias_dims);
  }

  // LayerNorm node
  auto layer_norm_node = graph->Add<ge::op::InstanceNorm>(y_name);
  auto layer_norm_op = layer_norm_node->data<ge::op::InstanceNorm>();
  layer_norm_op->set_input_x(*x_node->data());
  layer_norm_op->set_input_scale(*scale_node->data());
  layer_norm_op->set_input_bias(*bias_node->data());
  layer_norm_op->set_attr_reduction_indices(ge::AttrValue::LIST_INT({3}));
  layer_norm_op->set_attr_epsilon(epsilon);

  // Reshaped Y node if needed
  if (reshape) {
    auto reshaped_y_node = graph->Add<ge::op::Reshape>(
        y_name, layer_norm_node->precision(), layer_norm_node->layout());
    auto reshaped_y_op = reshaped_y_node->data<ge::op::Reshape>();
    reshaped_y_op->set_input_tensor(*layer_norm_node->data());
    reshaped_y_op->set_attr_shape(padded_y_shape);
  }
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(layer_norm,
                         kNPU,
                         paddle::lite::subgraph::npu::LayerNormConverter);
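
For reference, the math this bridge maps onto ge::op::InstanceNorm: layer_norm flattens the input into a [left, right] matrix at begin_norm_axis, normalizes each of the left rows by its own mean and variance, and then applies an element-wise scale and bias of length right. Below is a minimal standalone sketch of that semantics (plain C++, independent of Lite and the HiAI DDK; the name LayerNormRef, the flat row-major vector layout, and the demo in main are illustrative assumptions, not part of this commit):

#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>

// Row-wise layer normalization over a flattened x[left][right] matrix:
// y = (x - mean) / sqrt(var + epsilon) * scale + bias, with mean/var
// computed per row and scale/bias of length `right`.
std::vector<float> LayerNormRef(const std::vector<float>& x,
                                int left,
                                int right,
                                const std::vector<float>& scale,
                                const std::vector<float>& bias,
                                float epsilon = 1e-5f) {
  assert(x.size() == static_cast<size_t>(left) * right);
  std::vector<float> y(x.size());
  for (int i = 0; i < left; i++) {
    const float* row = x.data() + i * right;
    float mean = 0.f;
    float var = 0.f;
    for (int j = 0; j < right; j++) mean += row[j];
    mean /= right;
    for (int j = 0; j < right; j++) var += (row[j] - mean) * (row[j] - mean);
    var /= right;
    const float inv_std = 1.f / std::sqrt(var + epsilon);
    for (int j = 0; j < right; j++) {
      y[i * right + j] = (row[j] - mean) * inv_std * scale[j] + bias[j];
    }
  }
  return y;
}

int main() {
  // x_dims = (2, 3) with begin_norm_axis = 1 -> left = 2, right = 3.
  std::vector<float> x = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
  std::vector<float> scale = {1.f, 1.f, 1.f};
  std::vector<float> bias = {0.f, 0.f, 0.f};
  for (float v : LayerNormRef(x, 2, 3, scale, bias)) {
    std::printf("%.4f\n", v);  // each row normalizes to ~{-1.2247, 0, 1.2247}
  }
  return 0;
}

Comparing the NPU output against a row-wise reference like this is roughly what the updated layer_norm_compute_test.cc below does, with abs_error relaxed to 1e-2 for kNPU.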
1 change: 1 addition & 0 deletions lite/kernels/npu/bridges/paddle_use_bridges.h
@@ -55,3 +55,4 @@ USE_SUBGRAPH_BRIDGE(transpose2, kNPU);
USE_SUBGRAPH_BRIDGE(unsqueeze, kNPU);
USE_SUBGRAPH_BRIDGE(unsqueeze2, kNPU);
USE_SUBGRAPH_BRIDGE(instance_norm, kNPU);
+USE_SUBGRAPH_BRIDGE(layer_norm, kNPU);
2 changes: 1 addition & 1 deletion lite/tests/kernels/CMakeLists.txt
@@ -26,7 +26,7 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_
lite_cc_test(test_kernel_concat_compute SRCS concat_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_transpose_compute SRCS transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_reshape_compute SRCS reshape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
4 changes: 2 additions & 2 deletions lite/tests/kernels/instance_norm_compute_test.cc
@@ -122,8 +122,8 @@ class InstanceNormComputeTest : public arena::TestCase {
    fill_data_rand(bias.data(), -1.f, 1.f, scale_bias_dims.production());

    SetCommonTensor(x_, dims_, x.data());
-    SetCommonTensor(scale_, scale_bias_dims, scale.data());
-    SetCommonTensor(bias_, scale_bias_dims, bias.data());
+    SetCommonTensor(scale_, scale_bias_dims, scale.data(), {}, true);
+    SetCommonTensor(bias_, scale_bias_dims, bias.data(), {}, true);
  }
};

9 changes: 6 additions & 3 deletions lite/tests/kernels/layer_norm_compute_test.cc
@@ -132,13 +132,13 @@ class LayerNormComputeTest : public arena::TestCase {
      DDim scale_dims({scale_bias_size});
      std::vector<float> scale(scale_bias_size);
      fill_data_rand(scale.data(), -1.f, 1.f, scale_bias_size);
-      SetCommonTensor(scale_, scale_dims, scale.data());
+      SetCommonTensor(scale_, scale_dims, scale.data(), {}, true);
    }
    if (has_bias_) {
      DDim bias_dims({scale_bias_size});
      std::vector<float> bias(scale_bias_size);
      fill_data_rand(bias.data(), -1.f, 1.f, scale_bias_size);
-      SetCommonTensor(bias_, bias_dims, bias.data());
+      SetCommonTensor(bias_, bias_dims, bias.data(), {}, true);
    }
  }
};
@@ -149,6 +149,9 @@ TEST(LayerNorm, precision) {
  Place place;
#if defined(LITE_WITH_XPU)
  place = TARGET(kXPU);
+#elif defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;
#elif defined(LITE_WITH_ARM)
  place = TARGET(kARM);
  abs_error = 6e-5;
@@ -157,7 +160,7 @@
#endif

  for (auto dims :
-       std::vector<std::vector<int64_t>>{{1, 2, 3, 4}, {2, 3, 4}, {3, 4}}) {
+       std::vector<std::vector<int64_t>>{{2, 3, 4, 5}, {3, 4, 5}, {4, 5}}) {
    for (auto epsilon : {1e-5f}) {
      for (auto axis : {1, 2, 3}) {
        for (bool has_bias : {true, false}) {
