[LITE][NPU] Add layer_norm op bridge (#2767)
hong19860320 authored Jan 15, 2020
1 parent c0af965 commit 2ac5fe3
Showing 7 changed files with 183 additions and 10 deletions.
2 changes: 2 additions & 0 deletions lite/kernels/npu/bridges/CMakeLists.txt
@@ -42,6 +42,7 @@ lite_cc_library(subgraph_bridge_unsqueeze_op_npu SRCS unsqueeze_op.cc DEPS ${npu
lite_cc_library(subgraph_bridge_argmax_op_npu SRCS argmax_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_instance_norm_op_npu SRCS instance_norm_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_dropout_op_npu SRCS dropout_op.cc DEPS ${npu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_layer_norm_op_npu SRCS layer_norm_op.cc DEPS ${npu_subgraph_bridge_deps})

set(npu_subgraph_bridges
subgraph_bridge_registry
@@ -71,6 +72,7 @@ set(npu_subgraph_bridges
subgraph_bridge_argmax_op_npu
subgraph_bridge_instance_norm_op_npu
subgraph_bridge_dropout_op_npu
+subgraph_bridge_layer_norm_op_npu
CACHE INTERNAL "npu_subgraph_bridges")

message(STATUS "+++++ npu_subgraph_bridges: ${npu_subgraph_bridges}")
7 changes: 3 additions & 4 deletions lite/kernels/npu/bridges/instance_norm_op.cc
@@ -82,7 +82,7 @@ int InstanceNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  } else {
    if (!bias->persistable()) {
      LOG(WARNING) << "[NPU] Only supporting persistable bias tensor.";
-      bias->set_persistable(true);
+      return FAILED;
    }
    bias_node = graph->Add(bias_name, *bias, scale_bias_dims);
  }
@@ -108,7 +108,7 @@ int InstanceNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
    CHECK_EQ(channel_size, scale_dims.production());
    if (!scale->persistable()) {
      LOG(WARNING) << "[NPU] Only supporting persistable scale tensor.";
-      scale->set_persistable(true);
+      return FAILED;
    }
    scale_node = graph->Add(scale_name, *scale, scale_bias_dims);
  } else {
@@ -121,8 +121,7 @@ int InstanceNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  instance_norm_op->set_input_x(*x_node->data());
  instance_norm_op->set_input_scale(*scale_node->data());
  instance_norm_op->set_input_bias(*bias_node->data());
-  instance_norm_op->set_attr_reduction_indices(
-      ge::AttrValue::LIST_INT({0, 1, 2}));
+  instance_norm_op->set_attr_reduction_indices(ge::AttrValue::LIST_INT({2}));
  instance_norm_op->set_attr_epsilon(epsilon);
  return SUCCESS;
}
168 changes: 168 additions & 0 deletions lite/kernels/npu/bridges/layer_norm_op.cc
@@ -0,0 +1,168 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {

int LayerNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[NPU] Converting " + op_type + "...";

  // Get input and output vars and op attributes
  auto x_name = op_info->Input("X").front();
  auto x_type = kernel->GetInputDeclType("X");
  CHECK(x_type->precision() == PRECISION(kFloat));
  CHECK(x_type->layout() == DATALAYOUT(kNCHW));
  auto x = scope->FindMutableTensor(x_name);
  auto x_dims = x->dims();
  auto padded_x_shape = CvtShape(x_dims);
  auto x_rank = static_cast<int>(x_dims.size());
  CHECK(x_rank >= 2 && x_rank <= 4);

  auto y_name = op_info->Output("Y").front();
  auto y_type = kernel->GetOutputDeclType("Y");
  CHECK(y_type->precision() == PRECISION(kFloat));
  CHECK(y_type->layout() == DATALAYOUT(kNCHW));
  auto y = scope->FindMutableTensor(y_name);
  auto y_dims = y->dims();
  auto padded_y_shape = CvtShape(y_dims);

  auto epsilon = op_info->GetAttr<float>("epsilon");
  auto begin_norm_axis = op_info->GetAttr<int>("begin_norm_axis");
  if (begin_norm_axis < 0) {
    begin_norm_axis += x_rank;
  }
  CHECK(begin_norm_axis >= 1 && begin_norm_axis < x_rank);
  auto x_mat_dims = x_dims.Flatten2D(begin_norm_axis);
  auto left = x_mat_dims[0];
  auto right = x_mat_dims[1];

  // X node
  std::shared_ptr<Node> x_node = nullptr;
  if (graph->Has(x_name)) {
    x_node = graph->Get(x_name);
  } else {
    x_node = graph->Add(x_name, *x, padded_x_shape);
  }

  // Reshaped X node if needed
  bool reshape = false;
  if (!(x_rank == 4 && begin_norm_axis == 1)) {
    reshape = true;
    // Only a 4-D input shape (n, c, h, w) with axis = 1 is supported by the
    // HiAI DDK, so the input shape needs to be padded to 4-D if its rank is
    // less than 4 or axis != 1. For example:
    // (1) (n, c, h, w), axis=1 -> no need
    // (2) (n, c, h, w), axis=2 -> (n * c, h, w, 1)
    // (3) (n, c, h, w), axis=3 -> (n * c * h, w, 1, 1)
    // (4) (n, h, w), axis=1 -> (n, h, w, 1)
    // (5) (n, h, w), axis=2 -> (n * h, w, 1, 1)
    // (6) (h, w), axis=1 -> (h, w, 1, 1)
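    // A concrete instance of case (2): x_dims = (2, 3, 4, 5) with axis = 2
    // gives left = 2 * 3 = 6 and right = 4 * 5 = 20, so the padded shape is
    // (6, 4, 5, 1).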
    padded_x_shape = {left};
    for (int i = begin_norm_axis; i < x_rank; i++) {
      padded_x_shape.push_back(x_dims[i]);
    }
    auto remain = 4 - padded_x_shape.size();
    for (int i = 0; i < remain; i++) {
      padded_x_shape.push_back(1);
    }
    auto reshaped_x_node = graph->Add<ge::op::Reshape>(
        x_name + "/reshape", x_node->precision(), x_node->layout());
    auto reshaped_x_op = reshaped_x_node->data<ge::op::Reshape>();
    reshaped_x_op->set_input_tensor(*x_node->data());
    reshaped_x_op->set_attr_shape(padded_x_shape);
    x_node = reshaped_x_node;
  }

  // Bias node
  auto scale_bias_dims =
      DDim({1, padded_x_shape[1], padded_x_shape[2], padded_x_shape[3]});
  std::shared_ptr<Node> bias_node = nullptr;
  if (HasInputArg(op_info, scope, "Bias")) {
    auto bias_name = op_info->Input("Bias").front();
    auto bias_type = kernel->GetInputDeclType("Bias");
    CHECK(bias_type->precision() == PRECISION(kFloat));
    CHECK(bias_type->layout() == DATALAYOUT(kNCHW));
    auto bias = scope->FindMutableTensor(bias_name);
    auto bias_dims = bias->dims();
    CHECK_EQ(bias_dims.size(), 1);
    CHECK_EQ(bias_dims.production(), right);
    if (!bias->persistable()) {
      LOG(WARNING) << "[NPU] Only supporting persistable bias tensor.";
      return FAILED;
    }
    bias_node = graph->Add(bias_name, *bias, scale_bias_dims);
  } else {
    bias_node = graph->Add(y_name + "/bias", 0.0f, scale_bias_dims);
  }

  // Scale node
  std::shared_ptr<Node> scale_node = nullptr;
  if (HasInputArg(op_info, scope, "Scale")) {
    auto scale_name = op_info->Input("Scale").front();
    auto scale_type = kernel->GetInputDeclType("Scale");
    CHECK(scale_type->precision() == PRECISION(kFloat));
    CHECK(scale_type->layout() == DATALAYOUT(kNCHW));
    auto scale = scope->FindMutableTensor(scale_name);
    auto scale_dims = scale->dims();
    CHECK_EQ(scale_dims.size(), 1);
    CHECK_EQ(scale_dims.production(), right);
    if (!scale->persistable()) {
      LOG(WARNING) << "[NPU] Only supporting persistable scale tensor.";
      return FAILED;
    }
    scale_node = graph->Add(scale_name, *scale, scale_bias_dims);
  } else {
    scale_node = graph->Add(y_name + "/scale", 1.0f, scale_bias_dims);
  }

  // LayerNorm node
  auto layer_norm_node = graph->Add<ge::op::InstanceNorm>(y_name);
  auto layer_norm_op = layer_norm_node->data<ge::op::InstanceNorm>();
  layer_norm_op->set_input_x(*x_node->data());
  layer_norm_op->set_input_scale(*scale_node->data());
  layer_norm_op->set_input_bias(*bias_node->data());
  layer_norm_op->set_attr_reduction_indices(ge::AttrValue::LIST_INT({3}));
  layer_norm_op->set_attr_epsilon(epsilon);

  // Reshaped Y node if needed
  if (reshape) {
    auto reshaped_y_node = graph->Add<ge::op::Reshape>(
        y_name, layer_norm_node->precision(), layer_norm_node->layout());
    auto reshaped_y_op = reshaped_y_node->data<ge::op::Reshape>();
    reshaped_y_op->set_input_tensor(*layer_norm_node->data());
    reshaped_y_op->set_attr_shape(padded_y_shape);
  }
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(layer_norm,
                         kNPU,
                         paddle::lite::subgraph::npu::LayerNormConverter);
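
For reference, the math this bridge maps onto ge::op::InstanceNorm: layer_norm flattens the input into a [left, right] matrix at begin_norm_axis, normalizes each of the left rows by its own mean and variance, and then applies an element-wise scale and bias of length right. Below is a minimal standalone sketch of that semantics (plain C++, independent of Lite and the HiAI DDK; the name LayerNormRef, the flat row-major vector layout, and the demo in main are illustrative assumptions, not part of this commit):

#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>

// Row-wise layer normalization over a flattened x[left][right] matrix:
// y = (x - mean) / sqrt(var + epsilon) * scale + bias, with mean/var
// computed per row and scale/bias of length `right`.
std::vector<float> LayerNormRef(const std::vector<float>& x,
                                int left,
                                int right,
                                const std::vector<float>& scale,
                                const std::vector<float>& bias,
                                float epsilon = 1e-5f) {
  assert(x.size() == static_cast<size_t>(left) * right);
  std::vector<float> y(x.size());
  for (int i = 0; i < left; i++) {
    const float* row = x.data() + i * right;
    float mean = 0.f;
    float var = 0.f;
    for (int j = 0; j < right; j++) mean += row[j];
    mean /= right;
    for (int j = 0; j < right; j++) var += (row[j] - mean) * (row[j] - mean);
    var /= right;
    const float inv_std = 1.f / std::sqrt(var + epsilon);
    for (int j = 0; j < right; j++) {
      y[i * right + j] = (row[j] - mean) * inv_std * scale[j] + bias[j];
    }
  }
  return y;
}

int main() {
  // x_dims = (2, 3) with begin_norm_axis = 1 -> left = 2, right = 3.
  std::vector<float> x = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
  std::vector<float> scale = {1.f, 1.f, 1.f};
  std::vector<float> bias = {0.f, 0.f, 0.f};
  for (float v : LayerNormRef(x, 2, 3, scale, bias)) {
    std::printf("%.4f\n", v);  // each row normalizes to ~{-1.2247, 0, 1.2247}
  }
  return 0;
}

Comparing the NPU output against a row-wise reference like this is roughly what the updated layer_norm_compute_test.cc below does, with abs_error relaxed to 1e-2 for kNPU.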
1 change: 1 addition & 0 deletions lite/kernels/npu/bridges/paddle_use_bridges.h
@@ -55,3 +55,4 @@ USE_SUBGRAPH_BRIDGE(transpose2, kNPU);
USE_SUBGRAPH_BRIDGE(unsqueeze, kNPU);
USE_SUBGRAPH_BRIDGE(unsqueeze2, kNPU);
USE_SUBGRAPH_BRIDGE(instance_norm, kNPU);
+USE_SUBGRAPH_BRIDGE(layer_norm, kNPU);
2 changes: 1 addition & 1 deletion lite/tests/kernels/CMakeLists.txt
@@ -26,7 +26,7 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_
lite_cc_test(test_kernel_concat_compute SRCS concat_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_transpose_compute SRCS transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_reshape_compute SRCS reshape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
4 changes: 2 additions & 2 deletions lite/tests/kernels/instance_norm_compute_test.cc
@@ -122,8 +122,8 @@ class InstanceNormComputeTest : public arena::TestCase {
    fill_data_rand(bias.data(), -1.f, 1.f, scale_bias_dims.production());

    SetCommonTensor(x_, dims_, x.data());
-    SetCommonTensor(scale_, scale_bias_dims, scale.data());
-    SetCommonTensor(bias_, scale_bias_dims, bias.data());
+    SetCommonTensor(scale_, scale_bias_dims, scale.data(), {}, true);
+    SetCommonTensor(bias_, scale_bias_dims, bias.data(), {}, true);
  }
};

9 changes: 6 additions & 3 deletions lite/tests/kernels/layer_norm_compute_test.cc
@@ -132,13 +132,13 @@ class LayerNormComputeTest : public arena::TestCase {
      DDim scale_dims({scale_bias_size});
      std::vector<float> scale(scale_bias_size);
      fill_data_rand(scale.data(), -1.f, 1.f, scale_bias_size);
-      SetCommonTensor(scale_, scale_dims, scale.data());
+      SetCommonTensor(scale_, scale_dims, scale.data(), {}, true);
    }
    if (has_bias_) {
      DDim bias_dims({scale_bias_size});
      std::vector<float> bias(scale_bias_size);
      fill_data_rand(bias.data(), -1.f, 1.f, scale_bias_size);
-      SetCommonTensor(bias_, bias_dims, bias.data());
+      SetCommonTensor(bias_, bias_dims, bias.data(), {}, true);
    }
  }
};
@@ -149,6 +149,9 @@ TEST(LayerNorm, precision) {
  Place place;
#if defined(LITE_WITH_XPU)
  place = TARGET(kXPU);
+#elif defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;
#elif defined(LITE_WITH_ARM)
  place = TARGET(kARM);
  abs_error = 6e-5;
@@ -157,7 +160,7 @@
#endif

  for (auto dims :
-       std::vector<std::vector<int64_t>>{{1, 2, 3, 4}, {2, 3, 4}, {3, 4}}) {
+       std::vector<std::vector<int64_t>>{{2, 3, 4, 5}, {3, 4, 5}, {4, 5}}) {
    for (auto epsilon : {1e-5f}) {
      for (auto axis : {1, 2, 3}) {
        for (bool has_bias : {true, false}) {
