Skip to content

Commit

Permalink
[cherry-pick][OpenCL] collection of cherry-picks (#6266)
Browse files Browse the repository at this point in the history
* [OpenCL] Fix select fp32 compile crash (#6006)

* [Pass] Add opencl_kernel_place_correct_pass (#6037)

* [OpenCL] Fix invalid arg size in instance_norm (#6064)

* [OpenCL][Kernel] Add concat multi inputs kernel except channel is not aligned (#6075)

* [OpenCL][Bugfix] Fix target choose in opencl_kernel_place_correct_pass  (#6079)

* [OpenCL] fix kernel select of concat (#6158)

* [OpenCL] BindTargets KOpenCL for conv_conv_fuse_pass (#6125)

* test=develop

* [UTest] Loose abs_error for group_norm and instance_norm (#6188)

* loose group_norm abs_err. test=develop
  • Loading branch information
zhaoyang-star authored Jun 16, 2021
1 parent bf492d2 commit 3478dfe
Show file tree
Hide file tree
Showing 18 changed files with 685 additions and 174 deletions.
1 change: 1 addition & 0 deletions lite/api/paddle_use_passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ USE_MIR_PASS(static_kernel_pick_pass);
USE_MIR_PASS(variable_place_inference_pass);
USE_MIR_PASS(type_target_cast_pass);
USE_MIR_PASS(__fpga_kernel_place_correct_pass);
USE_MIR_PASS(opencl_kernel_place_correct_pass);
USE_MIR_PASS(generate_program_pass);

USE_MIR_PASS(io_copy_kernel_pick_pass);
Expand Down
20 changes: 9 additions & 11 deletions lite/backends/opencl/cl_kernel/cl_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ limitations under the License. */
#define MAX_VALUE FLT_MAX
#define MIN_VALUE -FLT_MAX

#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))

/////////////////////////////////
// CL_DTYPE_float / CL_DTYPE_half
/////////////////////////////////
Expand Down Expand Up @@ -93,7 +95,11 @@ __constant sampler_t SAMPLER =
inline CL_DTYPE activation(CL_DTYPE in, CL_DTYPE prelu_alpha) {
CL_DTYPE output = in;
#ifdef PRELU
#ifdef CL_DTYPE_half
output = select(prelu_alpha * in, in, (ushort)(isgreaterequal(in, 0)));
#else
output = select(prelu_alpha * in, in, (uint)(isgreaterequal(in, 0)));
#endif
#endif

#ifdef RELU
Expand All @@ -105,15 +111,12 @@ inline CL_DTYPE activation(CL_DTYPE in, CL_DTYPE prelu_alpha) {
#endif

#ifdef LEAKY_RELU
#ifdef CL_DTYPE_float
output = select((CL_DTYPE)(LEAKY_RELU_ALPHA)*in,
in,
(int)(isgreaterequal(in, 0))); // NOLINT
#endif

#ifdef CL_DTYPE_half
output = select(
(CL_DTYPE)(LEAKY_RELU_ALPHA)*in, in, (ushort)(isgreaterequal(in, 0)));
#else
output = select(
(CL_DTYPE)(LEAKY_RELU_ALPHA)*in, in, (uint)(isgreaterequal(in, 0)));
#endif
#endif

Expand Down Expand Up @@ -151,11 +154,6 @@ inline CL_DTYPE4 activation_type4(CL_DTYPE4 in, CL_DTYPE4 prelu_alpha) {
#ifdef LEAKY_RELU
output = select(
(CL_DTYPE4)(LEAKY_RELU_ALPHA)*in, in, isgreaterequal(in, (CL_DTYPE4)0));
// same as bellow:
// output = select((CL_DTYPE4)(LEAKY_RELU_ALPHA)*in,
// in,
// (ushort4)((in.x >= 0) << 15, (in.y >= 0) << 15, (in.z >= 0)
// << 15, (in.w >= 0) << 15));
#endif

#ifdef HARD_SWISH
Expand Down
362 changes: 311 additions & 51 deletions lite/backends/opencl/cl_kernel/image/concat_kernel.cl

Large diffs are not rendered by default.

63 changes: 27 additions & 36 deletions lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl
Original file line number Diff line number Diff line change
Expand Up @@ -79,106 +79,97 @@ __kernel void conv2d_3x3(__private const int global_size_dim0,
int input_block = input_c / 4;
int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x,
in_pos_in_one_block.y);
input0 = select(
input0 = SELECT(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
SAMPLER,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
zero_dtype4,
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
in_pos_in_one_block.y - dilation >= input_height);
input1 =
select(READ_IMG_TYPE(CL_DTYPE_CHAR,
SELECT(READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
SAMPLER,
(int2)(pos_in.x, pos_in.y - dilation)),
zero_dtype4,
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input2 = select(
in_pos_in_one_block.y - dilation >= input_height);
input2 = SELECT(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
SAMPLER,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
zero_dtype4,
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
in_pos_in_one_block.y - dilation >= input_height);

input3 =
select(READ_IMG_TYPE(CL_DTYPE_CHAR,
SELECT(READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
SAMPLER,
(int2)(pos_in.x - dilation, pos_in.y)),
zero_dtype4,
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
in_pos_in_one_block.y >= input_height);

input4 = select(
input4 = SELECT(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
SAMPLER,
(int2)(pos_in.x, pos_in.y)),
zero_dtype4,
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
in_pos_in_one_block.y >= input_height);
input5 =
select(READ_IMG_TYPE(CL_DTYPE_CHAR,
SELECT(READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
SAMPLER,
(int2)(pos_in.x + dilation, pos_in.y)),
zero_dtype4,
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input6 = select(
in_pos_in_one_block.y >= input_height);
input6 = SELECT(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
SAMPLER,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
zero_dtype4,
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
in_pos_in_one_block.y + dilation >= input_height);
input7 =
select(READ_IMG_TYPE(CL_DTYPE_CHAR,
SELECT(READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
SAMPLER,
(int2)(pos_in.x, pos_in.y + dilation)),
zero_dtype4,
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input8 = select(
in_pos_in_one_block.y + dilation >= input_height);
input8 = SELECT(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
SAMPLER,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
zero_dtype4,
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
in_pos_in_one_block.y + dilation >= input_height);

CL_DTYPE tmp_out = 0;
for (int j = 0; j < 9; j++) {
Expand Down
8 changes: 3 additions & 5 deletions lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl
Original file line number Diff line number Diff line change
Expand Up @@ -86,19 +86,17 @@ __kernel void conv2d_5x5(__private const int global_size_dim0,
in_pos_in_one_block.y + batch_index * input_height);
for (int j = 0; j < 5; j++) {
for (int k = 0; k < 5; k++) {
input = select(
input = SELECT(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
SAMPLER,
(int2)(pos_in.x + (j - 2) * dilation,
pos_in.y + (k - 2) * dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)(
(in_pos_in_one_block.x + (j - 2) * dilation < 0 ||
in_pos_in_one_block.x + (j - 2) * dilation < 0 ||
in_pos_in_one_block.y + (k - 2) * dilation < 0 ||
in_pos_in_one_block.x + (j - 2) * dilation >= input_width ||
in_pos_in_one_block.y + (k - 2) * dilation >= input_height)
<< 15));
in_pos_in_one_block.y + (k - 2) * dilation >= input_height);
int filter_h = k;
int filter_w = j;
int filter_c = i;
Expand Down
8 changes: 3 additions & 5 deletions lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl
Original file line number Diff line number Diff line change
Expand Up @@ -72,19 +72,17 @@ __kernel void conv2d_7x7(__private const int global_size_dim0,
in_pos_in_one_block.y + batch_index * input_height);
for (int j = 0; j < 7; j++) {
for (int k = 0; k < 7; k++) {
input = select(
input = SELECT(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
SAMPLER,
(int2)(pos_in.x + (j - 3) * dilation,
pos_in.y + (k - 3) * dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)(
(in_pos_in_one_block.x + (j - 3) * dilation < 0 ||
in_pos_in_one_block.x + (j - 3) * dilation < 0 ||
in_pos_in_one_block.y + (k - 3) * dilation < 0 ||
in_pos_in_one_block.x + (j - 3) * dilation >= input_width ||
in_pos_in_one_block.y + (k - 3) * dilation >= input_height)
<< 15));
in_pos_in_one_block.y + (k - 3) * dilation >= input_height);
int filter_h = k;
int filter_w = j;
int filter_c = i;
Expand Down
1 change: 1 addition & 0 deletions lite/core/mir/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ lite_cc_library(mir_passes
static_kernel_pick_pass.cc
variable_place_inference_pass.cc
fpga_kernel_place_correct_pass.cc
opencl_kernel_place_correct_pass.cc
type_target_cast_pass.cc
type_layout_cast_pass.cc
type_precision_cast_pass.cc
Expand Down
5 changes: 3 additions & 2 deletions lite/core/mir/fusion/conv_conv_fuse_pass.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ void ConvConvFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
bool has_int8 = false;
bool has_weight_quant = false;
for (auto& place : graph->valid_places()) {
if (place.target == TARGET(kARM) || place.target == TARGET(kHost)) {
if (place.target == TARGET(kARM) || place.target == TARGET(kHost) ||
place.target == TARGET(kOpenCL)) {
if (place.precision == PRECISION(kInt8)) {
has_int8 = true;
}
Expand Down Expand Up @@ -77,4 +78,4 @@ void ConvConvFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
} // namespace paddle

REGISTER_MIR_PASS(lite_conv_conv_fuse_pass, paddle::lite::mir::ConvConvFusePass)
.BindTargets({TARGET(kARM)});
.BindTargets({TARGET(kARM), TARGET(kOpenCL)});
34 changes: 34 additions & 0 deletions lite/core/mir/opencl_kernel_place_correct_pass.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/core/mir/opencl_kernel_place_correct_pass.h"
#include <memory>
#include "lite/core/mir/pass_registry.h"

namespace paddle {
namespace lite {
namespace mir {

void OpenCLKernelPlaceCorrectPass::Apply(
const std::unique_ptr<SSAGraph> &graph) {
CorrectArgumentPlace(graph.get());
}

} // namespace mir
} // namespace lite
} // namespace paddle

REGISTER_MIR_PASS(opencl_kernel_place_correct_pass,
paddle::lite::mir::OpenCLKernelPlaceCorrectPass)
.BindTargets({TARGET(kOpenCL)});
Loading

0 comments on commit 3478dfe

Please sign in to comment.