From f04917b53090c73a5fa7175b3d3ee25b633ad82f Mon Sep 17 00:00:00 2001 From: daming5432 Date: Wed, 3 Feb 2021 02:46:02 +0000 Subject: [PATCH 01/11] fuse conv prelu pass test=develop --- .../cl_kernel/image/conv2d_1x1_opt_kernel.cl | 59 ++++++++- .../cl_kernel/image/conv2d_3x3_kernel.cl | 18 ++- .../cl_kernel/image/conv2d_3x3_opt_kernel.cl | 114 +++++++++++++++++- .../cl_kernel/image/conv2d_5x5_kernel.cl | 20 ++- .../cl_kernel/image/conv2d_5x5_opt_kernel.cl | 114 +++++++++++++++++- .../cl_kernel/image/conv2d_7x7_kernel.cl | 18 ++- .../cl_kernel/image/conv2d_7x7_opt_kernel.cl | 114 +++++++++++++++++- .../cl_kernel/image/conv2d_common_kernel.cl | 49 +++++++- .../image/depthwise_conv2d_basic_kernel.cl | 18 ++- .../image/depthwise_conv2d_kernel.cl | 46 ++++++- .../mir/fusion/conv_activation_fuse_pass.cc | 10 +- lite/core/mir/fusion/conv_activation_fuser.cc | 14 +++ lite/core/mir/fusion/conv_activation_fuser.h | 5 +- lite/kernels/arm/conv_compute.cc | 6 + lite/kernels/opencl/conv_image_compute.cc | 57 +++++++++ lite/kernels/opencl/conv_image_compute.h | 2 + lite/operators/conv_op.h | 8 ++ 17 files changed, 654 insertions(+), 18 deletions(-) diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl index 57880dfec43..b44107051cd 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl @@ -21,7 +21,8 @@ __kernel void conv2d_1x1_opt( __private const int input_height, /* of one block */ __private const int output_width, __private const int output_height, - __private const int old_w) { + __private const int old_w, + __read_only image2d_t prelu_alpha) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); @@ -251,10 +252,36 @@ __kernel void conv2d_1x1_opt( READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, SAMPLER, (int2)(out_c, 0)); #endif +#ifdef PRELU_CH + CL_DTYPE4 alpha0 = + READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); + CL_DTYPE4 alpha1 = alpha0; + CL_DTYPE4 alpha2 = alpha0; + CL_DTYPE4 alpha3 = alpha0; +#elif defined(PRELU_ELE) + CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0); + CL_DTYPE4 alpha1 = alpha0; + CL_DTYPE4 alpha2 = alpha0; + CL_DTYPE4 alpha3 = alpha0; +#else + CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; + CL_DTYPE4 alpha1 = alpha0; + CL_DTYPE4 alpha2 = alpha0; + CL_DTYPE4 alpha3 = alpha0; +#endif + output0 = activation_type4(output0, alpha0); + output1 = activation_type4(output1, alpha1); + output2 = activation_type4(output2, alpha2); + output3 = activation_type4(output3, alpha3); +#else output0 = activation_type4(output0); output1 = activation_type4(output1); output2 = activation_type4(output2); output3 = activation_type4(output3); +#endif #ifdef SCALE_ACTIVATION output0 = fuse_scale(output0, 1.f, 0.f, 0.f); @@ -301,7 +328,8 @@ __kernel void conv2d_1x1_simple( __private const int input_height, /* of one block */ __private const int output_width, __private const int output_height, - __private const int old_w) { + __private const int old_w, + __read_only image2d_t prelu_alpha) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); const int out_nh = get_global_id(2); @@ -421,10 +449,37 @@ __kernel void conv2d_1x1_simple( READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, SAMPLER, (int2)(out_c, 0)); #endif +#ifdef PRELU +#ifdef PRELU_CH + CL_DTYPE4 alpha0 = + READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); + CL_DTYPE4 alpha1 = alpha0; + CL_DTYPE4 alpha2 = alpha0; + CL_DTYPE4 alpha3 = alpha0; +#elif defined(PRELU_ELE) + CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0); + CL_DTYPE4 alpha1 = alpha0; + CL_DTYPE4 alpha2 = alpha0; + CL_DTYPE4 alpha3 = alpha0; +#else + CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; + CL_DTYPE4 alpha1 = alpha0; + CL_DTYPE4 alpha2 = alpha0; + CL_DTYPE4 alpha3 = alpha0; +#endif + output0 = activation_type4(output0, alpha0); + output1 = activation_type4(output1, alpha1); + output2 = activation_type4(output2, alpha2); + output3 = activation_type4(output3, alpha3); +#else output0 = activation_type4(output0); output1 = activation_type4(output1); output2 = activation_type4(output2); output3 = activation_type4(output3); +#endif #ifdef SCALE_ACTIVATION output0 = fuse_scale(output0, 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl index 4e16ead836f..3b4d457789d 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl @@ -34,7 +34,8 @@ __kernel void conv2d_3x3(__private const int global_size_dim0, __private const int filter_width, __private const int filter_height, __private const int group, - __private const int input_tensor_c) { + __private const int input_tensor_c, + __read_only image2d_t prelu_alpha) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); @@ -251,7 +252,22 @@ __kernel void conv2d_3x3(__private const int global_size_dim0, output.w = (i == 3) ? output.w + tmp_out : output.w; } } +#ifdef PRELU +#ifdef PRELU_CH + CL_DTYPE4 alpha0 = + READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); +#elif defined(PRELU_ELE) + CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); +#else + CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; +#endif + output = activation_type4(output, alpha0); +#else output = activation_type4(output); +#endif #ifdef SCALE_ACTIVATION output = fuse_scale(output, 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl index 02a0c778103..f249c350a56 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl @@ -29,7 +29,8 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, __private const int in_w, __private const int in_h, __private const int out_w, - __private const int out_h) { + __private const int out_h, + __read_only image2d_t prelu_alpha) { // item_id const int item_ch_id = get_global_id(0); const int item_w_id = get_global_id(1); @@ -216,11 +217,65 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, } } +#ifdef PRELU +#ifdef PRELU_CH + CL_DTYPE4 alpha[5]; + alpha[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); + alpha[1] = alpha[0]; + alpha[2] = alpha[0]; + alpha[3] = alpha[0]; + alpha[4] = alpha[0]; + +#elif defined(PRELU_ELE) + CL_DTYPE4 alpha[5]; + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + alpha[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + alpha[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + alpha[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + alpha[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; + CL_DTYPE4 alpha[5] = {alpha0, alpha0, alpha0, alpha0, alpha0}; +#endif + output[0] = activation_type4(output[0], alpha[0]); + output[1] = activation_type4(output[1], alpha[1]); + output[2] = activation_type4(output[2], alpha[2]); + output[3] = activation_type4(output[3], alpha[3]); + output[4] = activation_type4(output[4], alpha[4]); +#else output[0] = activation_type4(output[0]); output[1] = activation_type4(output[1]); output[2] = activation_type4(output[2]); output[3] = activation_type4(output[3]); output[4] = activation_type4(output[4]); +#endif #ifdef SCALE_ACTIVATION output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); @@ -276,7 +331,8 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch, __private const int in_w, __private const int in_h, __private const int out_w, - __private const int out_h) { + __private const int out_h, + __read_only image2d_t prelu_alpha) { // item_id const int item_ch_id = get_global_id(0); const int item_w_id = get_global_id(1); @@ -464,11 +520,65 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch, } } +#ifdef PRELU +#ifdef PRELU_CH + CL_DTYPE4 alpha[5]; + alpha[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); + alpha[1] = alpha[0]; + alpha[2] = alpha[0]; + alpha[3] = alpha[0]; + alpha[4] = alpha[0]; + +#elif defined(PRELU_ELE) + CL_DTYPE4 alpha[5]; + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + alpha[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + alpha[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + alpha[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + alpha[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; + CL_DTYPE4 alpha[5] = {alpha0, alpha0, alpha0, alpha0, alpha0}; +#endif + output[0] = activation_type4(output[0], alpha[0]); + output[1] = activation_type4(output[1], alpha[1]); + output[2] = activation_type4(output[2], alpha[2]); + output[3] = activation_type4(output[3], alpha[3]); + output[4] = activation_type4(output[4], alpha[4]); +#else output[0] = activation_type4(output[0]); output[1] = activation_type4(output[1]); output[2] = activation_type4(output[2]); output[3] = activation_type4(output[3]); output[4] = activation_type4(output[4]); +#endif #ifdef SCALE_ACTIVATION output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl index 587d488e9ab..c73f3112fbe 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl @@ -32,7 +32,8 @@ __kernel void conv2d_5x5(__private const int global_size_dim0, __private const int input_width, /* of one block */ __private const int input_height, /* of one block */ __private const int output_width, - __private const int output_height) { + __private const int output_height, + __read_only image2d_t prelu_alpha) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); @@ -172,7 +173,22 @@ __kernel void conv2d_5x5(__private const int global_size_dim0, READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, SAMPLER, (int2)(out_c, 0)); #endif - output = activation_type4(output); +#ifdef PRELU +#ifdef PRELU_CH + CL_DTYPE4 alpha0 = + READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); +#elif defined(PRELU_ELE) + CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); +#else + CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; +#endif + output = activation_type4(output, alpha0); +#else + output = activation_type4(output); +#endif #ifdef SCALE_ACTIVATION output = fuse_scale(output, 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl index 927d56d5ac2..e6428b19a5f 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl @@ -30,7 +30,8 @@ __kernel void conv2d_5x5_opt(__private const int item_ch, __private const int in_w, __private const int in_h, __private const int out_w, - __private const int out_h) { + __private const int out_h, + __read_only image2d_t prelu_alpha) { // filter const int filter_w = 5; const int filter_h = 5; @@ -222,11 +223,65 @@ __kernel void conv2d_5x5_opt(__private const int item_ch, } } +#ifdef PRELU +#ifdef PRELU_CH + CL_DTYPE4 alpha[5]; + alpha[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); + alpha[1] = alpha[0]; + alpha[2] = alpha[0]; + alpha[3] = alpha[0]; + alpha[4] = alpha[0]; + +#elif defined(PRELU_ELE) + CL_DTYPE4 alpha[5]; + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + alpha[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + alpha[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + alpha[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + alpha[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; + CL_DTYPE4 alpha[5] = {alpha0, alpha0, alpha0, alpha0, alpha0}; +#endif + output[0] = activation_type4(output[0], alpha[0]); + output[1] = activation_type4(output[1], alpha[1]); + output[2] = activation_type4(output[2], alpha[2]); + output[3] = activation_type4(output[3], alpha[3]); + output[4] = activation_type4(output[4], alpha[4]); +#else output[0] = activation_type4(output[0]); output[1] = activation_type4(output[1]); output[2] = activation_type4(output[2]); output[3] = activation_type4(output[3]); output[4] = activation_type4(output[4]); +#endif #ifdef SCALE_ACTIVATION output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); @@ -281,7 +336,8 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch, __private const int in_w, __private const int in_h, __private const int out_w, - __private const int out_h) { + __private const int out_h, + __read_only image2d_t prelu_alpha) { // filter const int filter_w = 5; const int filter_h = 5; @@ -477,11 +533,65 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch, } } +#ifdef PRELU +#ifdef PRELU_CH + CL_DTYPE4 alpha[5]; + alpha[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); + alpha[1] = alpha[0]; + alpha[2] = alpha[0]; + alpha[3] = alpha[0]; + alpha[4] = alpha[0]; + +#elif defined(PRELU_ELE) + CL_DTYPE4 alpha[5]; + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + alpha[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + alpha[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + alpha[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + alpha[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; + CL_DTYPE4 alpha[5] = {alpha0, alpha0, alpha0, alpha0, alpha0}; +#endif + output[0] = activation_type4(output[0], alpha[0]); + output[1] = activation_type4(output[1], alpha[1]); + output[2] = activation_type4(output[2], alpha[2]); + output[3] = activation_type4(output[3], alpha[3]); + output[4] = activation_type4(output[4], alpha[4]); +#else output[0] = activation_type4(output[0]); output[1] = activation_type4(output[1]); output[2] = activation_type4(output[2]); output[3] = activation_type4(output[3]); output[4] = activation_type4(output[4]); +#endif #ifdef SCALE_ACTIVATION output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl index 67ffea51539..656def2d676 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl @@ -18,7 +18,8 @@ __kernel void conv2d_7x7(__private const int global_size_dim0, __private const int input_width, /* of one block */ __private const int input_height, /* of one block */ __private const int output_width, - __private const int output_height) { + __private const int output_height, + __read_only image2d_t prelu_alpha) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); @@ -123,7 +124,22 @@ __kernel void conv2d_7x7(__private const int global_size_dim0, READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, SAMPLER, (int2)(out_c, 0)); #endif +#ifdef PRELU +#ifdef PRELU_CH + CL_DTYPE4 alpha0 = + READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); +#elif defined(PRELU_ELE) + CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); +#else + CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; +#endif + output = activation_type4(output, alpha0); +#else output = activation_type4(output); +#endif #ifdef SCALE_ACTIVATION output = fuse_scale(output, 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl index cc64137fcd5..d02a2195e94 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl @@ -30,7 +30,8 @@ __kernel void conv2d_7x7_opt(__private const int item_ch, __private const int in_w, __private const int in_h, __private const int out_w, - __private const int out_h) { + __private const int out_h, + __read_only image2d_t prelu_alpha) { // filter const int filter_w = 7; const int filter_h = 7; @@ -222,11 +223,65 @@ __kernel void conv2d_7x7_opt(__private const int item_ch, } } +#ifdef PRELU +#ifdef PRELU_CH + CL_DTYPE4 alpha[5]; + alpha[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); + alpha[1] = alpha[0]; + alpha[2] = alpha[0]; + alpha[3] = alpha[0]; + alpha[4] = alpha[0]; + +#elif defined(PRELU_ELE) + CL_DTYPE4 alpha[5]; + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + alpha[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + alpha[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + alpha[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + alpha[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; + CL_DTYPE4 alpha[5] = {alpha0, alpha0, alpha0, alpha0, alpha0}; +#endif + output[0] = activation_type4(output[0], alpha[0]); + output[1] = activation_type4(output[1], alpha[1]); + output[2] = activation_type4(output[2], alpha[2]); + output[3] = activation_type4(output[3], alpha[3]); + output[4] = activation_type4(output[4], alpha[4]); +#else output[0] = activation_type4(output[0]); output[1] = activation_type4(output[1]); output[2] = activation_type4(output[2]); output[3] = activation_type4(output[3]); output[4] = activation_type4(output[4]); +#endif #ifdef SCALE_ACTIVATION output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); @@ -281,7 +336,8 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch, __private const int in_w, __private const int in_h, __private const int out_w, - __private const int out_h) { + __private const int out_h, + __read_only image2d_t prelu_alpha) { // filter const int filter_w = 7; const int filter_h = 7; @@ -477,11 +533,65 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch, } } +#ifdef PRELU +#ifdef PRELU_CH + CL_DTYPE4 alpha[5]; + alpha[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); + alpha[1] = alpha[0]; + alpha[2] = alpha[0]; + alpha[3] = alpha[0]; + alpha[4] = alpha[0]; + +#elif defined(PRELU_ELE) + CL_DTYPE4 alpha[5]; + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id0, item_h_id)); + if (out_w_id1 < out_w) { + alpha[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id1, item_h_id)); + } + if (out_w_id2 < out_w) { + alpha[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id2, item_h_id)); + } + if (out_w_id3 < out_w) { + alpha[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id3, item_h_id)); + } + if (out_w_id4 < out_w) { + alpha[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id4, item_h_id)); + } +#else + CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; + CL_DTYPE4 alpha[5] = {alpha0, alpha0, alpha0, alpha0, alpha0}; +#endif + output[0] = activation_type4(output[0], alpha[0]); + output[1] = activation_type4(output[1], alpha[1]); + output[2] = activation_type4(output[2], alpha[2]); + output[3] = activation_type4(output[3], alpha[3]); + output[4] = activation_type4(output[4], alpha[4]); +#else output[0] = activation_type4(output[0]); output[1] = activation_type4(output[1]); output[2] = activation_type4(output[2]); output[3] = activation_type4(output[3]); output[4] = activation_type4(output[4]); +#endif #ifdef SCALE_ACTIVATION output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_common_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_common_kernel.cl index 962fa3a7182..2e5ef6b2a93 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_common_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_common_kernel.cl @@ -33,7 +33,8 @@ __kernel void conv2d_common(__private const int global_size_dim0, __private const int padding_width, __private const int padding_height, __private const int dilation_width, - __private const int dilation_height) { + __private const int dilation_height, + __read_only image2d_t prelu_alpha) { const int out_channel_block_idx = get_global_id(0); const int out_width_block_idx = get_global_id(1); const int output_bh_idx = get_global_id(2); @@ -152,10 +153,56 @@ __kernel void conv2d_common(__private const int global_size_dim0, } } } +#ifdef PRELU +#ifdef PRELU_CH + CL_DTYPE4 alpha_base = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_channel_block_idx, 0)); + CL_DTYPE4 alpha0 = alpha_base; + CL_DTYPE4 alpha1 = alpha0; + CL_DTYPE4 alpha2 = alpha0; + CL_DTYPE4 alpha3 = alpha0; +#elif defined(PRELU_ELE) + CL_DTYPE4 alpha0, alpha1, alpha2, alpha3; + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id0, output_bh_idx)); + if (out_w_id1 < output_width) { + alpha1 = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id1, output_bh_idx)); + } + if (out_w_id2 < output_width) { + alpha2 = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id2, output_bh_idx)); + } + if (out_w_id3 < output_width) { + alpha3 = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id3, output_bh_idx)); + } +#else + CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; + CL_DTYPE4 alpha1 = alpha0; + CL_DTYPE4 alpha2 = alpha0; + CL_DTYPE4 alpha3 = alpha0; +#endif + out0 = activation_type4(out0, alpha0); + out1 = activation_type4(out1, alpha1); + out2 = activation_type4(out2, alpha2); + out3 = activation_type4(out3, alpha3); +#else out0 = activation_type4(out0); out1 = activation_type4(out1); out2 = activation_type4(out2); out3 = activation_type4(out3); +#endif #ifdef SCALE_ACTIVATION out0 = fuse_scale(out0, 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl index 8c41fe208f7..d66cdf837b6 100755 --- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl @@ -35,7 +35,8 @@ __kernel void depth_conv2d(__private const int global_size_dim0, __private const int output_width, __private const int output_height, __private const int filter_width, - __private const int filter_height) { + __private const int filter_height, + __read_only image2d_t prelu_alpha) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); @@ -94,7 +95,22 @@ __kernel void depth_conv2d(__private const int global_size_dim0, READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, SAMPLER, (int2)(out_c, 0)); #endif +#ifdef PRELU +#ifdef PRELU_CH + CL_DTYPE4 alpha0 = + READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); +#elif defined(PRELU_ELE) + CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); +#else + CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; +#endif + output = activation_type4(output, alpha0); +#else output = activation_type4(output); +#endif #ifdef SCALE_ACTIVATION output = fuse_scale(output, 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl index 729298a4b6b..3562a3df3a3 100755 --- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl @@ -30,7 +30,8 @@ __kernel void depth_conv2d_3x3( __private const int input_width, /* of one block */ __private const int input_height, /* of one block */ __private const int output_width, - __private const int output_height) { + __private const int output_height, + __read_only image2d_t prelu_alpha) { const int out_c = get_global_id(0); const int out_w = get_global_id(1); @@ -205,7 +206,22 @@ __kernel void depth_conv2d_3x3( output += inputs[i] * filters[i]; } +#ifdef PRELU +#ifdef PRELU_CH + CL_DTYPE4 alpha0 = + READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); +#elif defined(PRELU_ELE) + CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); +#else + CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; +#endif + output = activation_type4(output, alpha0); +#else output = activation_type4(output); +#endif #ifdef SCALE_ACTIVATION output = fuse_scale(output, 1.f, 0.f, 0.f); @@ -252,7 +268,8 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, __private const int in_w, /* of one block */ __private const int in_h, /* of one block */ __private const int ou_w, - __private const int ou_h) { + __private const int ou_h, + __read_only image2d_t prelu_alpha) { const int ou_ch_blk_id = get_global_id(0); const int ou_w_blk_id = get_global_id(1); @@ -363,8 +380,33 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, output[0] = mad(inputs[10], filters[8], output[0]); output[1] = mad(inputs[11], filters[8], output[1]); +#ifdef PRELU +#ifdef PRELU_CH + CL_DTYPE4 alpha[2]; + alpha[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(ou_ch_blk_id, 0)); + alpha[1] = alpha[0]; +#elif defined(PRELU_ELE) + CL_DTYPE4 alpha[2]; + alpha[0] = + READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(ou_x, ou_nh_id)); + if (ou_col_id + 1 < ou_w) { + alpha[1] = + READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(ou_x + 1, ou_nh_id)); + } +#else + CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; + CL_DTYPE4 alpha[2] = {alpha0, alpha0}; +#endif + output[0] = activation_type4(output[0], alpha[0]); + output[1] = activation_type4(output[1], alpha[1]); +#else output[0] = activation_type4(output[0]); output[1] = activation_type4(output[1]); +#endif #ifdef SCALE_ACTIVATION output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); diff --git a/lite/core/mir/fusion/conv_activation_fuse_pass.cc b/lite/core/mir/fusion/conv_activation_fuse_pass.cc index 5d8a1fece3a..20c0b978cec 100644 --- a/lite/core/mir/fusion/conv_activation_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_activation_fuse_pass.cc @@ -56,6 +56,7 @@ void ConvActivationFusePass::Apply(const std::unique_ptr& graph) { act_types.push_back("leaky_relu"); act_types.push_back("hard_swish"); act_types.push_back("hard_sigmoid"); + act_types.push_back("prelu"); } if (!has_int8 && has_cuda) { act_types.push_back("leaky_relu"); @@ -64,10 +65,17 @@ void ConvActivationFusePass::Apply(const std::unique_ptr& graph) { act_types.push_back("relu"); act_types.push_back("relu6"); } + bool has_alpha = false; for (auto conv_type : {"conv2d", "depthwise_conv2d", "conv2d_transpose"}) { for (auto act_type : act_types) { + if (act_type == "prelu") { + has_alpha = true; + } else { + has_alpha = false; + } for (auto has_bias : {true, false}) { - fusion::ConvActivationFuser fuser(conv_type, act_type, has_bias); + fusion::ConvActivationFuser fuser( + conv_type, act_type, has_bias, has_alpha); fuser(graph.get()); } } diff --git a/lite/core/mir/fusion/conv_activation_fuser.cc b/lite/core/mir/fusion/conv_activation_fuser.cc index 413941d301a..fc978b98ce8 100644 --- a/lite/core/mir/fusion/conv_activation_fuser.cc +++ b/lite/core/mir/fusion/conv_activation_fuser.cc @@ -28,9 +28,13 @@ void ConvActivationFuser::BuildPattern() { auto* filter = VarNode("filter")->assert_is_op_input(conv_type_, "Filter")->AsInput(); PMNode* bias = nullptr; + PMNode* alpha = nullptr; if (has_bias_) { bias = VarNode("bias")->assert_is_op_input(conv_type_, "Bias")->AsInput(); } + if (has_alpha_) { + alpha = VarNode("alpha")->assert_is_op_input(act_type_, "Alpha")->AsInput(); + } auto* conv2d = OpNode("conv2d", conv_type_)->AsIntermediate(); auto* act = OpNode("act", act_type_)->AsIntermediate(); @@ -49,6 +53,9 @@ void ConvActivationFuser::BuildPattern() { if (has_bias_) { *bias >> *conv2d; } + if (has_alpha_) { + *alpha >> *act; + } } void ConvActivationFuser::InsertNewNode(SSAGraph* graph, @@ -67,6 +74,9 @@ void ConvActivationFuser::InsertNewNode(SSAGraph* graph, if (has_bias_) { IR_NODE_LINK_TO(matched.at("bias"), new_op_node); } + if (has_alpha_) { + IR_NODE_LINK_TO(matched.at("alpha"), new_op_node); + } IR_NODE_LINK_TO(new_op_node, matched.at("output")); } @@ -97,6 +107,10 @@ cpp::OpDesc ConvActivationFuser::GenOpDesc(const key2nodes_t& matched) { float offset = act_op_desc.GetAttr("offset"); op_desc.SetAttr("slope", slope); op_desc.SetAttr("offset", offset); + } else if (act_type_ == "prelu") { + auto prelu_mode = act_op_desc.GetAttr("mode"); + op_desc.SetAttr("prelu_mode", prelu_mode); + op_desc.SetInput("prelu_alpha", {matched.at("alpha")->arg()->name}); } return op_desc; } diff --git a/lite/core/mir/fusion/conv_activation_fuser.h b/lite/core/mir/fusion/conv_activation_fuser.h index d352a32f9f8..04951b4c2cb 100644 --- a/lite/core/mir/fusion/conv_activation_fuser.h +++ b/lite/core/mir/fusion/conv_activation_fuser.h @@ -27,10 +27,12 @@ class ConvActivationFuser : public FuseBase { public: explicit ConvActivationFuser(const std::string& conv_type, const std::string& act_type, - bool has_bias) { + bool has_bias, + bool has_alpha) { conv_type_ = conv_type; act_type_ = act_type; has_bias_ = has_bias; + has_alpha_ = has_alpha; } void BuildPattern() override; @@ -41,6 +43,7 @@ class ConvActivationFuser : public FuseBase { std::string conv_type_; std::string act_type_; bool has_bias_; + bool has_alpha_; }; } // namespace fusion diff --git a/lite/kernels/arm/conv_compute.cc b/lite/kernels/arm/conv_compute.cc index 464ac7d646c..2a0364951c7 100644 --- a/lite/kernels/arm/conv_compute.cc +++ b/lite/kernels/arm/conv_compute.cc @@ -230,6 +230,7 @@ REGISTER_LITE_KERNEL(conv2d, kARM, kFP16, kNCHW, ConvFp16, def) REGISTER_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, ConvFp32, def) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("prelu_alpha", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kARM))}) .BindPaddleOpVersion("conv2d", 1) @@ -238,6 +239,7 @@ REGISTER_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, ConvFp32, def) REGISTER_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, ConvFp32, def) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("prelu_alpha", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kARM))}) .BindPaddleOpVersion("depthwise_conv2d", 1) @@ -246,6 +248,7 @@ REGISTER_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, ConvFp32, def) REGISTER_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, ConvInt8_Int8, int8_out) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) + .BindInput("prelu_alpha", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Output", @@ -256,6 +259,7 @@ REGISTER_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, ConvInt8_Int8, int8_out) REGISTER_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, ConvInt8_Fp32, fp32_out) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) + .BindInput("prelu_alpha", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Output", @@ -267,6 +271,7 @@ REGISTER_LITE_KERNEL( depthwise_conv2d, kARM, kInt8, kNCHW, ConvInt8_Int8, int8_out) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) + .BindInput("prelu_alpha", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Output", @@ -278,6 +283,7 @@ REGISTER_LITE_KERNEL( depthwise_conv2d, kARM, kInt8, kNCHW, ConvInt8_Fp32, fp32_out) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) + .BindInput("prelu_alpha", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Output", diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc index 97f8be08bcc..b8f0459b2fd 100644 --- a/lite/kernels/opencl/conv_image_compute.cc +++ b/lite/kernels/opencl/conv_image_compute.cc @@ -334,6 +334,37 @@ void ConvImageCompute::PrepareForRun() { std::to_string(conv_param_->activation_param.hard_sigmoid_offset); build_options_single += " -DHARD_SIGMOID -DHARD_SIGMOID_SLOPE=" + slope + "f" + " -DHARD_SIGMOID_OFFSET=" + offset + "f"; + } else if (conv_param_->activation_param.active_type == + lite_api::ActivationType::kPRelu) { + std::string prelu_mode = conv_param_->activation_param.Prelu_mode; + build_options_single += " -DPRELU"; + if (prelu_mode == "channel") { + build_options_single += " -DPRELU_CH"; + } else if (prelu_mode == "element") { + build_options_single += " -DPRELU_ELE"; + } else { + build_options_single += " -DPRELU_ALL"; + } + alpha_gpu_image_ = std::unique_ptr(new Tensor); + CLImageConverterFolder alpha_converter; + const DDim& alpha_image_dims = alpha_converter.InitImageDimInfoWith( + conv_param_->activation_param.Prelu_alpha->dims()); + std::unique_ptr tensor_hold_alpha_image = + std::unique_ptr(new Tensor); + tensor_hold_alpha_image->Resize( + {1, alpha_image_dims[0], alpha_image_dims[1], 4}); + auto* alpha_image_data = MUTABLE_DATA_CPU(tensor_hold_alpha_image); + auto* alpha_cpu_data = + conv_param_->activation_param.Prelu_alpha->mutable_data(); + alpha_converter.NCHWToImage( + alpha_cpu_data, + alpha_image_data, + conv_param_->activation_param.Prelu_alpha->dims()); + MUTABLE_DATA_GPU(alpha_gpu_image_, + alpha_image_dims[0], + alpha_image_dims[1], + alpha_image_data); + alpha_image_p_ = DATA_GPU(alpha_gpu_image_); } else { LOG(FATAL) << "Unsupported activation type:" << static_cast(conv_param_->activation_param.active_type); @@ -675,6 +706,8 @@ void ConvImageCompute::Conv2d1x1opt() { CL_CHECK_FATAL(status_); status_ = kernel_.setArg(16, default_w_blk_); CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(17, *alpha_image_p_); + CL_CHECK_FATAL(status_); } void ConvImageCompute::Conv2d3x3() { @@ -721,6 +754,8 @@ void ConvImageCompute::Conv2d3x3() { CL_CHECK_FATAL(status_); status_ = kernel_.setArg(20, input_tensor_c_); CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(21, *alpha_image_p_); + CL_CHECK_FATAL(status_); } void ConvImageCompute::Conv2d3x3opt() { @@ -756,6 +791,8 @@ void ConvImageCompute::Conv2d3x3opt() { CL_CHECK_FATAL(status_); status_ = kernel_.setArg(15, output_tensor_h_); CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(16, *alpha_image_p_); + CL_CHECK_FATAL(status_); } void ConvImageCompute::Conv2d5x5() { @@ -790,6 +827,8 @@ void ConvImageCompute::Conv2d5x5() { CL_CHECK_FATAL(status_); status_ = kernel_.setArg(14, output_tensor_h_); CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, *alpha_image_p_); + CL_CHECK_FATAL(status_); } void ConvImageCompute::Conv2d5x5opt() { @@ -825,6 +864,8 @@ void ConvImageCompute::Conv2d5x5opt() { CL_CHECK_FATAL(status_); status_ = kernel_.setArg(15, output_tensor_h_); CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(16, *alpha_image_p_); + CL_CHECK_FATAL(status_); } void ConvImageCompute::Conv2d7x7() { @@ -859,6 +900,8 @@ void ConvImageCompute::Conv2d7x7() { CL_CHECK_FATAL(status_); status_ = kernel_.setArg(13, output_tensor_h_); CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, *alpha_image_p_); + CL_CHECK_FATAL(status_); } void ConvImageCompute::Conv2d7x7opt() { @@ -894,6 +937,8 @@ void ConvImageCompute::Conv2d7x7opt() { CL_CHECK_FATAL(status_); status_ = kernel_.setArg(15, output_tensor_h_); CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(16, *alpha_image_p_); + CL_CHECK_FATAL(status_); } void ConvImageCompute::DepthwiseConv2d3x3s1() { @@ -927,6 +972,8 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() { CL_CHECK_FATAL(status_); status_ = kernel_.setArg(14, output_tensor_h_); CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, *alpha_image_p_); + CL_CHECK_FATAL(status_); } void ConvImageCompute::DepthwiseConv2d3x3() { @@ -963,6 +1010,8 @@ void ConvImageCompute::DepthwiseConv2d3x3() { CL_CHECK_FATAL(status_); status_ = kernel_.setArg(15, output_tensor_h_); CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(16, *alpha_image_p_); + CL_CHECK_FATAL(status_); } void ConvImageCompute::DepthwiseConv2d() { @@ -1003,6 +1052,8 @@ void ConvImageCompute::DepthwiseConv2d() { CL_CHECK_FATAL(status_); status_ = kernel_.setArg(17, filter_tensor_h_); CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(18, *alpha_image_p_); + CL_CHECK_FATAL(status_); } void ConvImageCompute::Conv2dCommon() { @@ -1047,6 +1098,8 @@ void ConvImageCompute::Conv2dCommon() { CL_CHECK_FATAL(status_); status_ = kernel_.setArg(19, dilation_h_); CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(20, *alpha_image_p_); + CL_CHECK_FATAL(status_); } void ConvImageCompute::Run() { @@ -1157,6 +1210,7 @@ REGISTER_LITE_KERNEL(conv2d, DATALAYOUT(kImageDefault))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("prelu_alpha", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL), PRECISION(kFP16), @@ -1176,6 +1230,7 @@ REGISTER_LITE_KERNEL(depthwise_conv2d, DATALAYOUT(kImageDefault))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("prelu_alpha", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL), PRECISION(kFP16), @@ -1196,6 +1251,7 @@ REGISTER_LITE_KERNEL(conv2d, DATALAYOUT(kImageDefault))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kHost))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("prelu_alpha", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL), PRECISION(kFP16), @@ -1215,6 +1271,7 @@ REGISTER_LITE_KERNEL(depthwise_conv2d, DATALAYOUT(kImageDefault))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kHost))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("prelu_alpha", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL), PRECISION(kFP16), diff --git a/lite/kernels/opencl/conv_image_compute.h b/lite/kernels/opencl/conv_image_compute.h index 0e66a9701cd..f96887e1f11 100644 --- a/lite/kernels/opencl/conv_image_compute.h +++ b/lite/kernels/opencl/conv_image_compute.h @@ -85,6 +85,7 @@ class ConvImageCompute : public KernelLite filter_gpu_image_{nullptr}; std::unique_ptr bias_gpu_image_{nullptr}; + std::unique_ptr alpha_gpu_image_{nullptr}; std::unique_ptr tensor_hold_filter_image_{nullptr}; std::unique_ptr tensor_hold_bias_image_{nullptr}; cl::NDRange global_work_size_ = cl::NDRange{ @@ -98,6 +99,7 @@ class ConvImageCompute : public KernelLite("slope"); param_.activation_param.hard_sigmoid_offset = op_desc.GetAttr("offset"); + } else if (act_type == "prelu") { + param_.activation_param.active_type = lite_api::ActivationType::kPRelu; + param_.activation_param.Prelu_mode = + op_desc.GetAttr("prelu_mode"); + auto prelu_alpha_name = op_desc.Input("prelu_alpha").front(); + auto prelu_alpha_var = scope->FindVar(prelu_alpha_name); + param_.activation_param.Prelu_alpha = + const_cast(&(prelu_alpha_var->Get())); } else { LOG(FATAL) << "The fused conv only supports fuse with relu, leaky " "relu, hard_swish, while the given activation type is " From f79d766c19d3784c4d9d6712a733ed3309106456 Mon Sep 17 00:00:00 2001 From: daming5432 Date: Wed, 3 Feb 2021 03:13:10 +0000 Subject: [PATCH 02/11] add #ifdef PRELU to conv2d_1x1_opt_kernel.cl test=develop --- lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl | 1 + 1 file changed, 1 insertion(+) diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl index b44107051cd..eeb72e09ddf 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl @@ -252,6 +252,7 @@ __kernel void conv2d_1x1_opt( READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, SAMPLER, (int2)(out_c, 0)); #endif +#ifdef PRELU #ifdef PRELU_CH CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); From 718c4d030672ff70650b94b73cfbabeaee76951f Mon Sep 17 00:00:00 2001 From: daming5432 Date: Fri, 19 Feb 2021 06:11:09 +0000 Subject: [PATCH 03/11] modify activation and activation_type4, fix prelu bug test=develop --- .../buffer/depthwise_conv2d_kernel.cl | 3 +- .../buffer/elementwise_add_kernel.cl | 3 +- .../opencl/cl_kernel/buffer/fc_kernel.cl | 18 +++-- .../opencl/cl_kernel/buffer/relu_kernel.cl | 3 +- lite/backends/opencl/cl_kernel/cl_common.h | 14 +--- .../cl_kernel/image/conv2d_1x1_opt_kernel.cl | 70 ++++++++----------- .../cl_kernel/image/conv2d_3x3_kernel.cl | 15 ++-- .../cl_kernel/image/conv2d_3x3_opt_kernel.cl | 50 ++++--------- .../cl_kernel/image/conv2d_5x5_kernel.cl | 14 ++-- .../cl_kernel/image/conv2d_5x5_opt_kernel.cl | 50 ++++--------- .../cl_kernel/image/conv2d_7x7_kernel.cl | 11 ++- .../cl_kernel/image/conv2d_7x7_opt_kernel.cl | 50 ++++--------- .../cl_kernel/image/conv2d_common_kernel.cl | 43 +++++------- .../image/depthwise_conv2d_basic_kernel.cl | 12 ++-- .../image/depthwise_conv2d_kernel.cl | 34 ++++----- .../cl_kernel/image/elementwise_add_kernel.cl | 3 +- .../cl_kernel/image/elementwise_sub_kernel.cl | 3 +- lite/core/mir/fusion/conv_activation_fuser.cc | 2 +- lite/kernels/arm/conv_compute.cc | 17 +++-- .../opencl/activation_image_compute.cc | 61 +++++++++++----- lite/kernels/opencl/conv_image_compute.cc | 8 +-- lite/operators/conv_op.h | 2 +- 22 files changed, 203 insertions(+), 283 deletions(-) diff --git a/lite/backends/opencl/cl_kernel/buffer/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/depthwise_conv2d_kernel.cl index ab575ba9b38..cc45f66859b 100644 --- a/lite/backends/opencl/cl_kernel/buffer/depthwise_conv2d_kernel.cl +++ b/lite/backends/opencl/cl_kernel/buffer/depthwise_conv2d_kernel.cl @@ -62,7 +62,8 @@ __kernel void depthwise_conv2d(const int numel, // num of elements v += bias_data[c]; } #ifdef RELU - output_data[index] = activation(v); + CL_DTYPE alpha; + output_data[index] = activation(v, alpha); #else output_data[index] = v; #endif diff --git a/lite/backends/opencl/cl_kernel/buffer/elementwise_add_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/elementwise_add_kernel.cl index bb6faea629c..05566798ad7 100644 --- a/lite/backends/opencl/cl_kernel/buffer/elementwise_add_kernel.cl +++ b/lite/backends/opencl/cl_kernel/buffer/elementwise_add_kernel.cl @@ -37,7 +37,8 @@ __kernel void elementwise_add(__global const CL_DTYPE* x_data, for (int n = 0; n < num; ++n) { // n: [0, h*w) *dout_ptr = *din_ptr + diny_data; #ifdef RELU - *dout_ptr = activation(*dout_ptr); + CL_DTYPE alpha; + *dout_ptr = activation(*dout_ptr, alpha); #endif ++dout_ptr; ++din_ptr; diff --git a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl index 080ce2b4574..b48b83e788a 100644 --- a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl +++ b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl @@ -54,7 +54,8 @@ void fc_gemm_naive(__global const CL_DTYPE* a, } #ifdef RELU - c[row * N + col] = activation(c0); + CL_DTYPE alpha; + c[row * N + col] = activation(c0, alpha); #else c[row * N + col] = c0; #endif @@ -91,7 +92,8 @@ void gemm_batch_naive(__global const CL_DTYPE* a, c0 += a0 * b0; } - cur_c[row * N + col] = activation(c0); + CL_DTYPE alpha; + cur_c[row * N + col] = activation(c0, alpha); } @@ -235,7 +237,8 @@ void fc_gemv_naive(__global const CL_DTYPE* a, } #ifdef RELU - c[col] = activation(c0); + CL_DTYPE alpha; + c[col] = activation(c0, alpha); #else c[col] = c0; #endif @@ -254,6 +257,7 @@ void fc_gemv_1x4(__global const CL_DTYPE* a, const int M, const int N, const int K) { const int col = get_global_id(0) << 2; // gws[0]: [0, N >> 2) height of B == N + half alpha; if (col + 3 < N) { half4 c0 = 0.0f; if (bias) { @@ -310,11 +314,11 @@ void fc_gemv_1x4(__global const CL_DTYPE* a, } else { switch (col % 4) { case 3: - c[col + 2] = activation(c0.z); + c[col + 2] = activation(c0.z, alpha); case 2: - c[col + 1] = activation(c0.y); + c[col + 1] = activation(c0.y, alpha); case 1: - c[col] = activation(c0.x); + c[col] = activation(c0.x, alpha); } } #else @@ -341,7 +345,7 @@ void fc_gemv_1x4(__global const CL_DTYPE* a, c0 += a0 * b0; } #ifdef RELU - c[col + col_offset] = activation(c0); + c[col + col_offset] = activation(c0, alpha); #else c[col + col_offset] = c0; #endif diff --git a/lite/backends/opencl/cl_kernel/buffer/relu_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/relu_kernel.cl index b07dc8132f4..fcf74685924 100644 --- a/lite/backends/opencl/cl_kernel/buffer/relu_kernel.cl +++ b/lite/backends/opencl/cl_kernel/buffer/relu_kernel.cl @@ -16,7 +16,8 @@ limitations under the License. */ __kernel void relu(__global const CL_DTYPE* x_data, const int count, __global CL_DTYPE* out_data) { const int index = get_global_id(0); + CL_DTYPE alpha; if (index < count) { - out_data[index] = activation(x_data[index]); + out_data[index] = activation(x_data[index], alpha); } } diff --git a/lite/backends/opencl/cl_kernel/cl_common.h b/lite/backends/opencl/cl_kernel/cl_common.h index a8013e4ec4c..6998051e1e9 100644 --- a/lite/backends/opencl/cl_kernel/cl_common.h +++ b/lite/backends/opencl/cl_kernel/cl_common.h @@ -90,12 +90,7 @@ __constant sampler_t SAMPLER = ///////////////////////////////// // activation / activation_type4 ///////////////////////////////// -inline CL_DTYPE activation(CL_DTYPE in -#ifdef PRELU - , - CL_DTYPE prelu_alpha -#endif - ) { +inline CL_DTYPE activation(CL_DTYPE in, CL_DTYPE prelu_alpha) { CL_DTYPE output = in; #ifdef PRELU output = select(prelu_alpha * in, in, in >= (CL_DTYPE)0); @@ -138,12 +133,7 @@ inline CL_DTYPE activation(CL_DTYPE in return output; } -inline CL_DTYPE4 activation_type4(CL_DTYPE4 in -#ifdef PRELU - , - CL_DTYPE4 prelu_alpha -#endif - ) { +inline CL_DTYPE4 activation_type4(CL_DTYPE4 in, CL_DTYPE4 prelu_alpha) { CL_DTYPE4 output = in; #ifdef PRELU output = select(prelu_alpha * in, in, isgreaterequal(in, (CL_DTYPE4)0)); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl index eeb72e09ddf..c2ea9a29c7b 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl @@ -252,37 +252,30 @@ __kernel void conv2d_1x1_opt( READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, SAMPLER, (int2)(out_c, 0)); #endif -#ifdef PRELU +CL_DTYPE4 alpha0,alpha1,alpha2,alpha3; #ifdef PRELU_CH - CL_DTYPE4 alpha0 = - READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); - CL_DTYPE4 alpha1 = alpha0; - CL_DTYPE4 alpha2 = alpha0; - CL_DTYPE4 alpha3 = alpha0; + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); + alpha1 = alpha0; + alpha2 = alpha0; + alpha3 = alpha0; #elif defined(PRELU_ELE) - CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0); - CL_DTYPE4 alpha1 = alpha0; - CL_DTYPE4 alpha2 = alpha0; - CL_DTYPE4 alpha3 = alpha0; -#else - CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0); + alpha1 = alpha0; + alpha2 = alpha0; + alpha3 = alpha0; +#elif defined(PRELU_ALL) + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); alpha0.y = alpha0.x; alpha0.z = alpha0.x; alpha0.w = alpha0.x; - CL_DTYPE4 alpha1 = alpha0; - CL_DTYPE4 alpha2 = alpha0; - CL_DTYPE4 alpha3 = alpha0; + alpha1 = alpha0; + alpha2 = alpha0; + alpha3 = alpha0; #endif output0 = activation_type4(output0, alpha0); output1 = activation_type4(output1, alpha1); output2 = activation_type4(output2, alpha2); output3 = activation_type4(output3, alpha3); -#else - output0 = activation_type4(output0); - output1 = activation_type4(output1); - output2 = activation_type4(output2); - output3 = activation_type4(output3); -#endif #ifdef SCALE_ACTIVATION output0 = fuse_scale(output0, 1.f, 0.f, 0.f); @@ -450,37 +443,30 @@ __kernel void conv2d_1x1_simple( READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, SAMPLER, (int2)(out_c, 0)); #endif -#ifdef PRELU +CL_DTYPE4 alpha0,alpha1,alpha2,alpha3; #ifdef PRELU_CH - CL_DTYPE4 alpha0 = - READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); - CL_DTYPE4 alpha1 = alpha0; - CL_DTYPE4 alpha2 = alpha0; - CL_DTYPE4 alpha3 = alpha0; + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); + alpha1 = alpha0; + alpha2 = alpha0; + alpha3 = alpha0; #elif defined(PRELU_ELE) - CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0); - CL_DTYPE4 alpha1 = alpha0; - CL_DTYPE4 alpha2 = alpha0; - CL_DTYPE4 alpha3 = alpha0; -#else - CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0); + alpha1 = alpha0; + alpha2 = alpha0; + alpha3 = alpha0; +#elif defined(PRELU_ALL) + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); alpha0.y = alpha0.x; alpha0.z = alpha0.x; alpha0.w = alpha0.x; - CL_DTYPE4 alpha1 = alpha0; - CL_DTYPE4 alpha2 = alpha0; - CL_DTYPE4 alpha3 = alpha0; + alpha1 = alpha0; + alpha2 = alpha0; + alpha3 = alpha0; #endif output0 = activation_type4(output0, alpha0); output1 = activation_type4(output1, alpha1); output2 = activation_type4(output2, alpha2); output3 = activation_type4(output3, alpha3); -#else - output0 = activation_type4(output0); - output1 = activation_type4(output1); - output2 = activation_type4(output2); - output3 = activation_type4(output3); -#endif #ifdef SCALE_ACTIVATION output0 = fuse_scale(output0, 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl index 3b4d457789d..d22137e5046 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl @@ -252,22 +252,19 @@ __kernel void conv2d_3x3(__private const int global_size_dim0, output.w = (i == 3) ? output.w + tmp_out : output.w; } } -#ifdef PRELU + +CL_DTYPE4 alpha0; #ifdef PRELU_CH - CL_DTYPE4 alpha0 = - READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); #elif defined(PRELU_ELE) - CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); -#else - CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); +#elif defined(PRELU_ALL) + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); alpha0.y = alpha0.x; alpha0.z = alpha0.x; alpha0.w = alpha0.x; #endif output = activation_type4(output, alpha0); -#else - output = activation_type4(output); -#endif #ifdef SCALE_ACTIVATION output = fuse_scale(output, 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl index f249c350a56..ff5dd97a3d4 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl @@ -217,18 +217,15 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, } } -#ifdef PRELU +CL_DTYPE4 alpha[5]; #ifdef PRELU_CH - CL_DTYPE4 alpha[5]; - alpha[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); alpha[1] = alpha[0]; alpha[2] = alpha[0]; alpha[3] = alpha[0]; alpha[4] = alpha[0]; #elif defined(PRELU_ELE) - CL_DTYPE4 alpha[5]; alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, @@ -257,25 +254,17 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, SAMPLER, (int2)(out_w_base_id + out_w_id4, item_h_id)); } -#else - CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); - alpha0.y = alpha0.x; - alpha0.z = alpha0.x; - alpha0.w = alpha0.x; - CL_DTYPE4 alpha[5] = {alpha0, alpha0, alpha0, alpha0, alpha0}; +#elif defined(PRELU_ALL) + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha[0].y = alpha[0].x; alpha[0].z = alpha[0].x; alpha[0].w = alpha[0].x; + alpha[1] = alpha[0]; alpha[2] = alpha[0]; + alpha[3] = alpha[0]; alpha[4] = alpha[0]; #endif output[0] = activation_type4(output[0], alpha[0]); output[1] = activation_type4(output[1], alpha[1]); output[2] = activation_type4(output[2], alpha[2]); output[3] = activation_type4(output[3], alpha[3]); output[4] = activation_type4(output[4], alpha[4]); -#else - output[0] = activation_type4(output[0]); - output[1] = activation_type4(output[1]); - output[2] = activation_type4(output[2]); - output[3] = activation_type4(output[3]); - output[4] = activation_type4(output[4]); -#endif #ifdef SCALE_ACTIVATION output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); @@ -520,18 +509,15 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch, } } -#ifdef PRELU +CL_DTYPE4 alpha[5]; #ifdef PRELU_CH - CL_DTYPE4 alpha[5]; - alpha[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); alpha[1] = alpha[0]; alpha[2] = alpha[0]; alpha[3] = alpha[0]; alpha[4] = alpha[0]; #elif defined(PRELU_ELE) - CL_DTYPE4 alpha[5]; alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, @@ -560,25 +546,17 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch, SAMPLER, (int2)(out_w_base_id + out_w_id4, item_h_id)); } -#else - CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); - alpha0.y = alpha0.x; - alpha0.z = alpha0.x; - alpha0.w = alpha0.x; - CL_DTYPE4 alpha[5] = {alpha0, alpha0, alpha0, alpha0, alpha0}; +#elif defined(PRELU_ALL) + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha[0].y = alpha[0].x; alpha[0].z = alpha[0].x; alpha[0].w = alpha[0].x; + alpha[1] = alpha[0]; alpha[2] = alpha[0]; + alpha[3] = alpha[0]; alpha[4] = alpha[0]; #endif output[0] = activation_type4(output[0], alpha[0]); output[1] = activation_type4(output[1], alpha[1]); output[2] = activation_type4(output[2], alpha[2]); output[3] = activation_type4(output[3], alpha[3]); output[4] = activation_type4(output[4], alpha[4]); -#else - output[0] = activation_type4(output[0]); - output[1] = activation_type4(output[1]); - output[2] = activation_type4(output[2]); - output[3] = activation_type4(output[3]); - output[4] = activation_type4(output[4]); -#endif #ifdef SCALE_ACTIVATION output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl index c73f3112fbe..df14c294d6c 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl @@ -173,22 +173,18 @@ __kernel void conv2d_5x5(__private const int global_size_dim0, READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, SAMPLER, (int2)(out_c, 0)); #endif -#ifdef PRELU +CL_DTYPE4 alpha0; #ifdef PRELU_CH - CL_DTYPE4 alpha0 = - READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); #elif defined(PRELU_ELE) - CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); -#else - CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); +#elif defined(PRELU_ALL) + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); alpha0.y = alpha0.x; alpha0.z = alpha0.x; alpha0.w = alpha0.x; #endif output = activation_type4(output, alpha0); -#else - output = activation_type4(output); -#endif #ifdef SCALE_ACTIVATION output = fuse_scale(output, 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl index e6428b19a5f..c58994d8f2d 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl @@ -223,18 +223,15 @@ __kernel void conv2d_5x5_opt(__private const int item_ch, } } -#ifdef PRELU +CL_DTYPE4 alpha[5]; #ifdef PRELU_CH - CL_DTYPE4 alpha[5]; - alpha[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); alpha[1] = alpha[0]; alpha[2] = alpha[0]; alpha[3] = alpha[0]; alpha[4] = alpha[0]; #elif defined(PRELU_ELE) - CL_DTYPE4 alpha[5]; alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, @@ -263,25 +260,17 @@ __kernel void conv2d_5x5_opt(__private const int item_ch, SAMPLER, (int2)(out_w_base_id + out_w_id4, item_h_id)); } -#else - CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); - alpha0.y = alpha0.x; - alpha0.z = alpha0.x; - alpha0.w = alpha0.x; - CL_DTYPE4 alpha[5] = {alpha0, alpha0, alpha0, alpha0, alpha0}; +#elif defined(PRELU_ALL) + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha[0].y = alpha[0].x; alpha[0].z = alpha[0].x; alpha[0].w = alpha[0].x; + alpha[1] = alpha[0]; alpha[2] = alpha[0]; + alpha[3] = alpha[0]; alpha[4] = alpha[0]; #endif output[0] = activation_type4(output[0], alpha[0]); output[1] = activation_type4(output[1], alpha[1]); output[2] = activation_type4(output[2], alpha[2]); output[3] = activation_type4(output[3], alpha[3]); output[4] = activation_type4(output[4], alpha[4]); -#else - output[0] = activation_type4(output[0]); - output[1] = activation_type4(output[1]); - output[2] = activation_type4(output[2]); - output[3] = activation_type4(output[3]); - output[4] = activation_type4(output[4]); -#endif #ifdef SCALE_ACTIVATION output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); @@ -533,18 +522,15 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch, } } -#ifdef PRELU +CL_DTYPE4 alpha[5]; #ifdef PRELU_CH - CL_DTYPE4 alpha[5]; - alpha[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); alpha[1] = alpha[0]; alpha[2] = alpha[0]; alpha[3] = alpha[0]; alpha[4] = alpha[0]; #elif defined(PRELU_ELE) - CL_DTYPE4 alpha[5]; alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, @@ -573,25 +559,17 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch, SAMPLER, (int2)(out_w_base_id + out_w_id4, item_h_id)); } -#else - CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); - alpha0.y = alpha0.x; - alpha0.z = alpha0.x; - alpha0.w = alpha0.x; - CL_DTYPE4 alpha[5] = {alpha0, alpha0, alpha0, alpha0, alpha0}; +#elif defined(PRELU_ALL) + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha[0].y = alpha[0].x; alpha[0].z = alpha[0].x; alpha[0].w = alpha[0].x; + alpha[1] = alpha[0]; alpha[2] = alpha[0]; + alpha[3] = alpha[0]; alpha[4] = alpha[0]; #endif output[0] = activation_type4(output[0], alpha[0]); output[1] = activation_type4(output[1], alpha[1]); output[2] = activation_type4(output[2], alpha[2]); output[3] = activation_type4(output[3], alpha[3]); output[4] = activation_type4(output[4], alpha[4]); -#else - output[0] = activation_type4(output[0]); - output[1] = activation_type4(output[1]); - output[2] = activation_type4(output[2]); - output[3] = activation_type4(output[3]); - output[4] = activation_type4(output[4]); -#endif #ifdef SCALE_ACTIVATION output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl index 656def2d676..9799e1942db 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl @@ -124,14 +124,13 @@ __kernel void conv2d_7x7(__private const int global_size_dim0, READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, SAMPLER, (int2)(out_c, 0)); #endif -#ifdef PRELU +CL_DTYPE4 alpha0; #ifdef PRELU_CH - CL_DTYPE4 alpha0 = - READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); #elif defined(PRELU_ELE) - CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); -#else - CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); +#elif defined(PRELU_ALL) + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); alpha0.y = alpha0.x; alpha0.z = alpha0.x; alpha0.w = alpha0.x; diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl index d02a2195e94..95d6e140a8e 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl @@ -223,18 +223,15 @@ __kernel void conv2d_7x7_opt(__private const int item_ch, } } -#ifdef PRELU +CL_DTYPE4 alpha[5]; #ifdef PRELU_CH - CL_DTYPE4 alpha[5]; - alpha[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); alpha[1] = alpha[0]; alpha[2] = alpha[0]; alpha[3] = alpha[0]; alpha[4] = alpha[0]; #elif defined(PRELU_ELE) - CL_DTYPE4 alpha[5]; alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, @@ -263,25 +260,17 @@ __kernel void conv2d_7x7_opt(__private const int item_ch, SAMPLER, (int2)(out_w_base_id + out_w_id4, item_h_id)); } -#else - CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); - alpha0.y = alpha0.x; - alpha0.z = alpha0.x; - alpha0.w = alpha0.x; - CL_DTYPE4 alpha[5] = {alpha0, alpha0, alpha0, alpha0, alpha0}; +#elif defined(PRELU_ALL) + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha[0].y = alpha[0].x; alpha[0].z = alpha[0].x; alpha[0].w = alpha[0].x; + alpha[1] = alpha[0]; alpha[2] = alpha[0]; + alpha[3] = alpha[0]; alpha[4] = alpha[0]; #endif output[0] = activation_type4(output[0], alpha[0]); output[1] = activation_type4(output[1], alpha[1]); output[2] = activation_type4(output[2], alpha[2]); output[3] = activation_type4(output[3], alpha[3]); output[4] = activation_type4(output[4], alpha[4]); -#else - output[0] = activation_type4(output[0]); - output[1] = activation_type4(output[1]); - output[2] = activation_type4(output[2]); - output[3] = activation_type4(output[3]); - output[4] = activation_type4(output[4]); -#endif #ifdef SCALE_ACTIVATION output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); @@ -533,18 +522,15 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch, } } -#ifdef PRELU +CL_DTYPE4 alpha[5]; #ifdef PRELU_CH - CL_DTYPE4 alpha[5]; - alpha[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); alpha[1] = alpha[0]; alpha[2] = alpha[0]; alpha[3] = alpha[0]; alpha[4] = alpha[0]; #elif defined(PRELU_ELE) - CL_DTYPE4 alpha[5]; alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, @@ -573,25 +559,17 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch, SAMPLER, (int2)(out_w_base_id + out_w_id4, item_h_id)); } -#else - CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); - alpha0.y = alpha0.x; - alpha0.z = alpha0.x; - alpha0.w = alpha0.x; - CL_DTYPE4 alpha[5] = {alpha0, alpha0, alpha0, alpha0, alpha0}; +#elif defined(PRELU_ALL) + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha[0].y = alpha[0].x; alpha[0].z = alpha[0].x; alpha[0].w = alpha[0].x; + alpha[1] = alpha[0]; alpha[2] = alpha[0]; + alpha[3] = alpha[0]; alpha[4] = alpha[0]; #endif output[0] = activation_type4(output[0], alpha[0]); output[1] = activation_type4(output[1], alpha[1]); output[2] = activation_type4(output[2], alpha[2]); output[3] = activation_type4(output[3], alpha[3]); output[4] = activation_type4(output[4], alpha[4]); -#else - output[0] = activation_type4(output[0]); - output[1] = activation_type4(output[1]); - output[2] = activation_type4(output[2]); - output[3] = activation_type4(output[3]); - output[4] = activation_type4(output[4]); -#endif #ifdef SCALE_ACTIVATION output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_common_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_common_kernel.cl index 2e5ef6b2a93..3b3f856527a 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_common_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_common_kernel.cl @@ -87,7 +87,6 @@ __kernel void conv2d_common(__private const int global_size_dim0, CL_DTYPE4 out3 = out0; #endif - int in_width0 = mad24(out_width_block_idx, stride_width << 2, -padding_width); int in_width1 = in_width0 + stride_width; int in_width2 = in_width0 + stride_width * 2; @@ -153,15 +152,13 @@ __kernel void conv2d_common(__private const int global_size_dim0, } } } -#ifdef PRELU +CL_DTYPE4 alpha0, alpha1, alpha2, alpha3; #ifdef PRELU_CH - CL_DTYPE4 alpha_base = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_channel_block_idx, 0)); - CL_DTYPE4 alpha0 = alpha_base; - CL_DTYPE4 alpha1 = alpha0; - CL_DTYPE4 alpha2 = alpha0; - CL_DTYPE4 alpha3 = alpha0; + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_channel_block_idx, 0)); + alpha1 = alpha0; + alpha2 = alpha0; + alpha3 = alpha0; #elif defined(PRELU_ELE) - CL_DTYPE4 alpha0, alpha1, alpha2, alpha3; alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, @@ -184,31 +181,25 @@ __kernel void conv2d_common(__private const int global_size_dim0, SAMPLER, (int2)(out_w_base_id + out_w_id3, output_bh_idx)); } -#else - CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); - alpha0.y = alpha0.x; - alpha0.z = alpha0.x; - alpha0.w = alpha0.x; - CL_DTYPE4 alpha1 = alpha0; - CL_DTYPE4 alpha2 = alpha0; - CL_DTYPE4 alpha3 = alpha0; +#elif defined(PRELU_ALL) + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; + alpha1 = alpha0; + alpha2 = alpha0; + alpha3 = alpha0; #endif out0 = activation_type4(out0, alpha0); out1 = activation_type4(out1, alpha1); out2 = activation_type4(out2, alpha2); out3 = activation_type4(out3, alpha3); -#else - out0 = activation_type4(out0); - out1 = activation_type4(out1); - out2 = activation_type4(out2); - out3 = activation_type4(out3); -#endif #ifdef SCALE_ACTIVATION - out0 = fuse_scale(out0, 1.f, 0.f, 0.f); - out1 = fuse_scale(out1, 1.f, 0.f, 0.f); - out2 = fuse_scale(out2, 1.f, 0.f, 0.f); - out3 = fuse_scale(out3, 1.f, 0.f, 0.f); + out0 = fuse_scale(out0, 1.f, 0.f, 0.f); + out1 = fuse_scale(out1, 1.f, 0.f, 0.f); + out2 = fuse_scale(out2, 1.f, 0.f, 0.f); + out3 = fuse_scale(out3, 1.f, 0.f, 0.f); #endif const int out_x_base = mul24(out_channel_block_idx, output_width); diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl index d66cdf837b6..9826e38aad7 100755 --- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl @@ -95,22 +95,18 @@ __kernel void depth_conv2d(__private const int global_size_dim0, READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, SAMPLER, (int2)(out_c, 0)); #endif -#ifdef PRELU +CL_DTYPE4 alpha0; #ifdef PRELU_CH - CL_DTYPE4 alpha0 = - READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); #elif defined(PRELU_ELE) - CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); #else - CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); alpha0.y = alpha0.x; alpha0.z = alpha0.x; alpha0.w = alpha0.x; #endif output = activation_type4(output, alpha0); -#else - output = activation_type4(output); -#endif #ifdef SCALE_ACTIVATION output = fuse_scale(output, 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl index 3562a3df3a3..1ae7a8ae0f4 100755 --- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl @@ -206,22 +206,18 @@ __kernel void depth_conv2d_3x3( output += inputs[i] * filters[i]; } -#ifdef PRELU +CL_DTYPE4 alpha0; #ifdef PRELU_CH - CL_DTYPE4 alpha0 = - READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); #elif defined(PRELU_ELE) - CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); #else - CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); alpha0.y = alpha0.x; alpha0.z = alpha0.x; alpha0.w = alpha0.x; #endif output = activation_type4(output, alpha0); -#else - output = activation_type4(output); -#endif #ifdef SCALE_ACTIVATION output = fuse_scale(output, 1.f, 0.f, 0.f); @@ -380,14 +376,11 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, output[0] = mad(inputs[10], filters[8], output[0]); output[1] = mad(inputs[11], filters[8], output[1]); -#ifdef PRELU +CL_DTYPE4 alpha[2]; #ifdef PRELU_CH - CL_DTYPE4 alpha[2]; - alpha[0] = - READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(ou_ch_blk_id, 0)); + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(ou_ch_blk_id, 0)); alpha[1] = alpha[0]; #elif defined(PRELU_ELE) - CL_DTYPE4 alpha[2]; alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(ou_x, ou_nh_id)); if (ou_col_id + 1 < ou_w) { @@ -395,18 +388,15 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(ou_x + 1, ou_nh_id)); } #else - CL_DTYPE4 alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); - alpha0.y = alpha0.x; - alpha0.z = alpha0.x; - alpha0.w = alpha0.x; - CL_DTYPE4 alpha[2] = {alpha0, alpha0}; + alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha[0].y = alpha[0].x; + alpha[0].z = alpha[0].x; + alpha[0].w = alpha[0].x; + alpha[1] = alpha[0]; + #endif output[0] = activation_type4(output[0], alpha[0]); output[1] = activation_type4(output[1], alpha[1]); -#else - output[0] = activation_type4(output[0]); - output[1] = activation_type4(output[1]); -#endif #ifdef SCALE_ACTIVATION output[0] = fuse_scale(output[0], 1.f, 0.f, 0.f); diff --git a/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl b/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl index 0dc287ed847..a7cc062b664 100644 --- a/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl @@ -33,7 +33,8 @@ __kernel void elementwise_add(__read_only image2d_t input, CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, coords); #endif CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, SAMPLER, coords); - CL_DTYPE4 output = activation_type4(in + biase); + CL_DTYPE4 alpha; + CL_DTYPE4 output = activation_type4(in + biase, alpha); WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage,coords,output); } diff --git a/lite/backends/opencl/cl_kernel/image/elementwise_sub_kernel.cl b/lite/backends/opencl/cl_kernel/image/elementwise_sub_kernel.cl index 3bcc2159705..c31131f15ae 100644 --- a/lite/backends/opencl/cl_kernel/image/elementwise_sub_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/elementwise_sub_kernel.cl @@ -26,7 +26,8 @@ __kernel void elementwise_sub(__read_only image2d_t input, CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, coords); CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, SAMPLER, coords); - CL_DTYPE4 output = activation_type4(in - biase); + CL_DTYPE4 alpha; + CL_DTYPE4 output = activation_type4(in - biase, alpha); WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage,coords,output); } diff --git a/lite/core/mir/fusion/conv_activation_fuser.cc b/lite/core/mir/fusion/conv_activation_fuser.cc index fc978b98ce8..49c5029ff9c 100644 --- a/lite/core/mir/fusion/conv_activation_fuser.cc +++ b/lite/core/mir/fusion/conv_activation_fuser.cc @@ -110,7 +110,7 @@ cpp::OpDesc ConvActivationFuser::GenOpDesc(const key2nodes_t& matched) { } else if (act_type_ == "prelu") { auto prelu_mode = act_op_desc.GetAttr("mode"); op_desc.SetAttr("prelu_mode", prelu_mode); - op_desc.SetInput("prelu_alpha", {matched.at("alpha")->arg()->name}); + op_desc.SetInput("Prelu_alpha", {matched.at("alpha")->arg()->name}); } return op_desc; } diff --git a/lite/kernels/arm/conv_compute.cc b/lite/kernels/arm/conv_compute.cc index 2a0364951c7..5faf1d02f48 100644 --- a/lite/kernels/arm/conv_compute.cc +++ b/lite/kernels/arm/conv_compute.cc @@ -221,6 +221,7 @@ typedef paddle::lite::kernels::arm::ConvCompute(new Tensor); kernel_func_name_ = "prelu_channel"; auto& out_dims = act_param_->Out->dims(); - width_ = out_dims[3]; + if (out_dims.size() == 4) { + width_ = out_dims[3]; + CLImageConverterFolder alpha_converter; + const DDim& alpha_image_dims = alpha_converter.InitImageDimInfoWith( + act_param_->Prelu_alpha->dims()); + tensor_hold_alpha_image_->Resize( + {1, alpha_image_dims[0], alpha_image_dims[1], 4}); - CLImageConverterFolder alpha_converter; - const DDim& alpha_image_dims = alpha_converter.InitImageDimInfoWith( - act_param_->Prelu_alpha->dims()); - tensor_hold_alpha_image_->Resize( - {1, alpha_image_dims[0], alpha_image_dims[1], 4}); + auto* alpha_image_data = MUTABLE_DATA_CPU(tensor_hold_alpha_image_); + auto* alpha_cpu_data = + act_param_->Prelu_alpha->mutable_data(); + alpha_converter.NCHWToImage(alpha_cpu_data, + alpha_image_data, + act_param_->Prelu_alpha->dims()); - auto* alpha_image_data = MUTABLE_DATA_CPU(tensor_hold_alpha_image_); - auto* alpha_cpu_data = act_param_->Prelu_alpha->mutable_data(); - alpha_converter.NCHWToImage(alpha_cpu_data, - alpha_image_data, - act_param_->Prelu_alpha->dims()); + MUTABLE_DATA_GPU(alpha_gpu_image_, + alpha_image_dims[0], + alpha_image_dims[1], + alpha_image_data); + } else if (out_dims.size() == 2) { + width_ = 1; + CLImageConverterDefault alpha_converter; + const DDim& alpha_image_dims = alpha_converter.InitImageDimInfoWith( + act_param_->Prelu_alpha->dims()); + tensor_hold_alpha_image_->Resize( + {1, alpha_image_dims[0], alpha_image_dims[1], 4}); - MUTABLE_DATA_GPU(alpha_gpu_image_, - alpha_image_dims[0], - alpha_image_dims[1], - alpha_image_data); + auto* alpha_image_data = MUTABLE_DATA_CPU(tensor_hold_alpha_image_); + auto* alpha_cpu_data = + act_param_->Prelu_alpha->mutable_data(); + alpha_converter.NCHWToImage(alpha_cpu_data, + alpha_image_data, + act_param_->Prelu_alpha->dims()); + + MUTABLE_DATA_GPU(alpha_gpu_image_, + alpha_image_dims[0], + alpha_image_dims[1], + alpha_image_data); + } else { + LOG(FATAL) << "unsupport dims.size(): " << out_dims.size(); + } } else { alpha_gpu_image_ = std::unique_ptr(new Tensor); tensor_hold_alpha_image_ = std::unique_ptr(new Tensor); kernel_func_name_ = "prelu_element"; auto& in_dim = act_param_->X->dims(); - height_ = in_dim[2]; + if (in_dim.size() > 3) { + height_ = in_dim[2]; + } else { + height_ = 1; + } scale_ = act_param_->Leaky_relu_alpha; - CLImageConverterFolder alpha_converter; + CLImageConverterDefault alpha_converter; const DDim& alpha_image_dims = alpha_converter.InitImageDimInfoWith( act_param_->Prelu_alpha->dims()); tensor_hold_alpha_image_->Resize( diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc index b8f0459b2fd..26d66d7c01b 100644 --- a/lite/kernels/opencl/conv_image_compute.cc +++ b/lite/kernels/opencl/conv_image_compute.cc @@ -1210,7 +1210,7 @@ REGISTER_LITE_KERNEL(conv2d, DATALAYOUT(kImageDefault))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("prelu_alpha", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Prelu_alpha", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL), PRECISION(kFP16), @@ -1230,7 +1230,7 @@ REGISTER_LITE_KERNEL(depthwise_conv2d, DATALAYOUT(kImageDefault))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("prelu_alpha", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Prelu_alpha", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL), PRECISION(kFP16), @@ -1251,7 +1251,7 @@ REGISTER_LITE_KERNEL(conv2d, DATALAYOUT(kImageDefault))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kHost))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindInput("prelu_alpha", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Prelu_alpha", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL), PRECISION(kFP16), @@ -1271,7 +1271,7 @@ REGISTER_LITE_KERNEL(depthwise_conv2d, DATALAYOUT(kImageDefault))}) .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kHost))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindInput("prelu_alpha", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Prelu_alpha", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL), PRECISION(kFP16), diff --git a/lite/operators/conv_op.h b/lite/operators/conv_op.h index 9245d280e99..a99736ee422 100644 --- a/lite/operators/conv_op.h +++ b/lite/operators/conv_op.h @@ -140,7 +140,7 @@ class ConvOpLite : public OpLite { param_.activation_param.active_type = lite_api::ActivationType::kPRelu; param_.activation_param.Prelu_mode = op_desc.GetAttr("prelu_mode"); - auto prelu_alpha_name = op_desc.Input("prelu_alpha").front(); + auto prelu_alpha_name = op_desc.Input("Prelu_alpha").front(); auto prelu_alpha_var = scope->FindVar(prelu_alpha_name); param_.activation_param.Prelu_alpha = const_cast(&(prelu_alpha_var->Get())); From 8d1c252e934fc6e399c0aec3faf1faba8849b7ff Mon Sep 17 00:00:00 2001 From: daming5432 Date: Tue, 23 Feb 2021 11:33:52 +0000 Subject: [PATCH 04/11] test=develop From f07185252666fdc2fa36ada0410153adef971c66 Mon Sep 17 00:00:00 2001 From: daming5432 Date: Mon, 1 Mar 2021 09:10:38 +0000 Subject: [PATCH 05/11] add alpha_image_p_ init test=develop --- lite/kernels/opencl/conv_image_compute.cc | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc index 26d66d7c01b..1c5a4811fac 100644 --- a/lite/kernels/opencl/conv_image_compute.cc +++ b/lite/kernels/opencl/conv_image_compute.cc @@ -302,6 +302,14 @@ void ConvImageCompute::PrepareForRun() { << static_cast(conv_param_->activation_param.active_type) << " conv_param_->activation_param.has_active:" << conv_param_->activation_param.has_active; + // alpha_image_p_ init + alpha_gpu_image_ = std::unique_ptr(new Tensor); + std::unique_ptr tensor_hold_alpha_image = + std::unique_ptr(new Tensor); + tensor_hold_alpha_image->Resize({1, 1, 1, 4}); + auto* alpha_image_data = DATA_GPU(tensor_hold_alpha_image); + MUTABLE_DATA_GPU(alpha_gpu_image_, 1, 1, alpha_image_data); + alpha_image_p_ = DATA_GPU(alpha_gpu_image_); if (conv_param_->activation_param.has_active) { if (conv_param_->activation_param.active_type == lite_api::ActivationType::kRelu) { @@ -345,12 +353,9 @@ void ConvImageCompute::PrepareForRun() { } else { build_options_single += " -DPRELU_ALL"; } - alpha_gpu_image_ = std::unique_ptr(new Tensor); CLImageConverterFolder alpha_converter; const DDim& alpha_image_dims = alpha_converter.InitImageDimInfoWith( conv_param_->activation_param.Prelu_alpha->dims()); - std::unique_ptr tensor_hold_alpha_image = - std::unique_ptr(new Tensor); tensor_hold_alpha_image->Resize( {1, alpha_image_dims[0], alpha_image_dims[1], 4}); auto* alpha_image_data = MUTABLE_DATA_CPU(tensor_hold_alpha_image); From 9a386fd585104196083e80b12c9a60f960b443fc Mon Sep 17 00:00:00 2001 From: daming5432 Date: Mon, 1 Mar 2021 09:36:29 +0000 Subject: [PATCH 06/11] add annotation for prelu test=develop --- .../cl_kernel/image/conv2d_1x1_opt_kernel.cl | 18 +++++++++++------ .../cl_kernel/image/conv2d_3x3_kernel.cl | 9 ++++++--- .../cl_kernel/image/conv2d_3x3_opt_kernel.cl | 20 +++++++++++-------- .../cl_kernel/image/conv2d_5x5_kernel.cl | 9 ++++++--- .../cl_kernel/image/conv2d_5x5_opt_kernel.cl | 20 +++++++++++-------- .../cl_kernel/image/conv2d_7x7_kernel.cl | 9 ++++++--- .../cl_kernel/image/conv2d_7x7_opt_kernel.cl | 20 +++++++++++-------- .../cl_kernel/image/conv2d_common_kernel.cl | 9 ++++++--- .../image/depthwise_conv2d_basic_kernel.cl | 9 ++++++--- .../image/depthwise_conv2d_kernel.cl | 19 +++++++++++------- 10 files changed, 90 insertions(+), 52 deletions(-) diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl index c2ea9a29c7b..67b0ccdd50e 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl @@ -253,17 +253,19 @@ __kernel void conv2d_1x1_opt( #endif CL_DTYPE4 alpha0,alpha1,alpha2,alpha3; -#ifdef PRELU_CH +#ifdef PRELU_CH //{ alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); alpha1 = alpha0; alpha2 = alpha0; alpha3 = alpha0; -#elif defined(PRELU_ELE) + //} +#elif defined(PRELU_ELE) //{ alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0); alpha1 = alpha0; alpha2 = alpha0; alpha3 = alpha0; -#elif defined(PRELU_ALL) + //} +#elif defined(PRELU_ALL) //{ alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); alpha0.y = alpha0.x; alpha0.z = alpha0.x; @@ -271,6 +273,7 @@ CL_DTYPE4 alpha0,alpha1,alpha2,alpha3; alpha1 = alpha0; alpha2 = alpha0; alpha3 = alpha0; + //} #endif output0 = activation_type4(output0, alpha0); output1 = activation_type4(output1, alpha1); @@ -444,17 +447,19 @@ __kernel void conv2d_1x1_simple( #endif CL_DTYPE4 alpha0,alpha1,alpha2,alpha3; -#ifdef PRELU_CH +#ifdef PRELU_CH //{ alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); alpha1 = alpha0; alpha2 = alpha0; alpha3 = alpha0; -#elif defined(PRELU_ELE) + //} +#elif defined(PRELU_ELE) //{ alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0); alpha1 = alpha0; alpha2 = alpha0; alpha3 = alpha0; -#elif defined(PRELU_ALL) + //} +#elif defined(PRELU_ALL) //{ alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); alpha0.y = alpha0.x; alpha0.z = alpha0.x; @@ -462,6 +467,7 @@ CL_DTYPE4 alpha0,alpha1,alpha2,alpha3; alpha1 = alpha0; alpha2 = alpha0; alpha3 = alpha0; + //} #endif output0 = activation_type4(output0, alpha0); output1 = activation_type4(output1, alpha1); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl index d22137e5046..65c119a23c4 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl @@ -254,15 +254,18 @@ __kernel void conv2d_3x3(__private const int global_size_dim0, } CL_DTYPE4 alpha0; -#ifdef PRELU_CH +#ifdef PRELU_CH //{ alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); -#elif defined(PRELU_ELE) + //} +#elif defined(PRELU_ELE) //{ alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); -#elif defined(PRELU_ALL) + //} +#elif defined(PRELU_ALL) //{ alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); alpha0.y = alpha0.x; alpha0.z = alpha0.x; alpha0.w = alpha0.x; + //} #endif output = activation_type4(output, alpha0); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl index ff5dd97a3d4..ebd0e021b99 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl @@ -218,14 +218,14 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, } CL_DTYPE4 alpha[5]; -#ifdef PRELU_CH +#ifdef PRELU_CH //{ alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); alpha[1] = alpha[0]; alpha[2] = alpha[0]; alpha[3] = alpha[0]; alpha[4] = alpha[0]; - -#elif defined(PRELU_ELE) + //} +#elif defined(PRELU_ELE) //{ alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, @@ -254,11 +254,13 @@ CL_DTYPE4 alpha[5]; SAMPLER, (int2)(out_w_base_id + out_w_id4, item_h_id)); } -#elif defined(PRELU_ALL) + //} +#elif defined(PRELU_ALL) //{ alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); alpha[0].y = alpha[0].x; alpha[0].z = alpha[0].x; alpha[0].w = alpha[0].x; alpha[1] = alpha[0]; alpha[2] = alpha[0]; alpha[3] = alpha[0]; alpha[4] = alpha[0]; + //} #endif output[0] = activation_type4(output[0], alpha[0]); output[1] = activation_type4(output[1], alpha[1]); @@ -510,14 +512,14 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch, } CL_DTYPE4 alpha[5]; -#ifdef PRELU_CH +#ifdef PRELU_CH //{ alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); alpha[1] = alpha[0]; alpha[2] = alpha[0]; alpha[3] = alpha[0]; alpha[4] = alpha[0]; - -#elif defined(PRELU_ELE) + //} +#elif defined(PRELU_ELE) //{ alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, @@ -546,11 +548,13 @@ CL_DTYPE4 alpha[5]; SAMPLER, (int2)(out_w_base_id + out_w_id4, item_h_id)); } -#elif defined(PRELU_ALL) + //} +#elif defined(PRELU_ALL) //{ alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); alpha[0].y = alpha[0].x; alpha[0].z = alpha[0].x; alpha[0].w = alpha[0].x; alpha[1] = alpha[0]; alpha[2] = alpha[0]; alpha[3] = alpha[0]; alpha[4] = alpha[0]; + //} #endif output[0] = activation_type4(output[0], alpha[0]); output[1] = activation_type4(output[1], alpha[1]); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl index df14c294d6c..d96dc8cc11b 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl @@ -174,15 +174,18 @@ __kernel void conv2d_5x5(__private const int global_size_dim0, #endif CL_DTYPE4 alpha0; -#ifdef PRELU_CH +#ifdef PRELU_CH //{ alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); -#elif defined(PRELU_ELE) + //} +#elif defined(PRELU_ELE) //{ alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); -#elif defined(PRELU_ALL) + //} +#elif defined(PRELU_ALL) //{ alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); alpha0.y = alpha0.x; alpha0.z = alpha0.x; alpha0.w = alpha0.x; + //} #endif output = activation_type4(output, alpha0); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl index c58994d8f2d..b041eb73ac1 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl @@ -224,14 +224,14 @@ __kernel void conv2d_5x5_opt(__private const int item_ch, } CL_DTYPE4 alpha[5]; -#ifdef PRELU_CH +#ifdef PRELU_CH //{ alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); alpha[1] = alpha[0]; alpha[2] = alpha[0]; alpha[3] = alpha[0]; alpha[4] = alpha[0]; - -#elif defined(PRELU_ELE) + //} +#elif defined(PRELU_ELE) //{ alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, @@ -260,11 +260,13 @@ CL_DTYPE4 alpha[5]; SAMPLER, (int2)(out_w_base_id + out_w_id4, item_h_id)); } -#elif defined(PRELU_ALL) + //} +#elif defined(PRELU_ALL) //{ alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); alpha[0].y = alpha[0].x; alpha[0].z = alpha[0].x; alpha[0].w = alpha[0].x; alpha[1] = alpha[0]; alpha[2] = alpha[0]; alpha[3] = alpha[0]; alpha[4] = alpha[0]; + //} #endif output[0] = activation_type4(output[0], alpha[0]); output[1] = activation_type4(output[1], alpha[1]); @@ -523,14 +525,14 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch, } CL_DTYPE4 alpha[5]; -#ifdef PRELU_CH +#ifdef PRELU_CH //{ alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); alpha[1] = alpha[0]; alpha[2] = alpha[0]; alpha[3] = alpha[0]; alpha[4] = alpha[0]; - -#elif defined(PRELU_ELE) + //} +#elif defined(PRELU_ELE) //{ alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, @@ -559,11 +561,13 @@ CL_DTYPE4 alpha[5]; SAMPLER, (int2)(out_w_base_id + out_w_id4, item_h_id)); } -#elif defined(PRELU_ALL) + //} +#elif defined(PRELU_ALL) //{ alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); alpha[0].y = alpha[0].x; alpha[0].z = alpha[0].x; alpha[0].w = alpha[0].x; alpha[1] = alpha[0]; alpha[2] = alpha[0]; alpha[3] = alpha[0]; alpha[4] = alpha[0]; + //} #endif output[0] = activation_type4(output[0], alpha[0]); output[1] = activation_type4(output[1], alpha[1]); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl index 9799e1942db..092bcf49355 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl @@ -125,15 +125,18 @@ __kernel void conv2d_7x7(__private const int global_size_dim0, #endif CL_DTYPE4 alpha0; -#ifdef PRELU_CH +#ifdef PRELU_CH //{ alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); -#elif defined(PRELU_ELE) + //} +#elif defined(PRELU_ELE) //{ alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); -#elif defined(PRELU_ALL) + //} +#elif defined(PRELU_ALL) //{ alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); alpha0.y = alpha0.x; alpha0.z = alpha0.x; alpha0.w = alpha0.x; + //} #endif output = activation_type4(output, alpha0); #else diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl index 95d6e140a8e..497234428a8 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl @@ -224,14 +224,14 @@ __kernel void conv2d_7x7_opt(__private const int item_ch, } CL_DTYPE4 alpha[5]; -#ifdef PRELU_CH +#ifdef PRELU_CH //{ alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); alpha[1] = alpha[0]; alpha[2] = alpha[0]; alpha[3] = alpha[0]; alpha[4] = alpha[0]; - -#elif defined(PRELU_ELE) + //} +#elif defined(PRELU_ELE) //{ alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, @@ -260,11 +260,13 @@ CL_DTYPE4 alpha[5]; SAMPLER, (int2)(out_w_base_id + out_w_id4, item_h_id)); } -#elif defined(PRELU_ALL) + //} +#elif defined(PRELU_ALL) //{ alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); alpha[0].y = alpha[0].x; alpha[0].z = alpha[0].x; alpha[0].w = alpha[0].x; alpha[1] = alpha[0]; alpha[2] = alpha[0]; alpha[3] = alpha[0]; alpha[4] = alpha[0]; + //} #endif output[0] = activation_type4(output[0], alpha[0]); output[1] = activation_type4(output[1], alpha[1]); @@ -523,14 +525,14 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch, } CL_DTYPE4 alpha[5]; -#ifdef PRELU_CH +#ifdef PRELU_CH //{ alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(item_ch_id, 0)); alpha[1] = alpha[0]; alpha[2] = alpha[0]; alpha[3] = alpha[0]; alpha[4] = alpha[0]; - -#elif defined(PRELU_ELE) + //} +#elif defined(PRELU_ELE) //{ alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, @@ -559,11 +561,13 @@ CL_DTYPE4 alpha[5]; SAMPLER, (int2)(out_w_base_id + out_w_id4, item_h_id)); } -#elif defined(PRELU_ALL) + //} +#elif defined(PRELU_ALL) //{ alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); alpha[0].y = alpha[0].x; alpha[0].z = alpha[0].x; alpha[0].w = alpha[0].x; alpha[1] = alpha[0]; alpha[2] = alpha[0]; alpha[3] = alpha[0]; alpha[4] = alpha[0]; + //} #endif output[0] = activation_type4(output[0], alpha[0]); output[1] = activation_type4(output[1], alpha[1]); diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_common_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_common_kernel.cl index 3b3f856527a..d60440aaea5 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_common_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_common_kernel.cl @@ -153,12 +153,13 @@ __kernel void conv2d_common(__private const int global_size_dim0, } } CL_DTYPE4 alpha0, alpha1, alpha2, alpha3; -#ifdef PRELU_CH +#ifdef PRELU_CH //{ alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_channel_block_idx, 0)); alpha1 = alpha0; alpha2 = alpha0; alpha3 = alpha0; -#elif defined(PRELU_ELE) + //} +#elif defined(PRELU_ELE) //{ alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, @@ -181,7 +182,8 @@ CL_DTYPE4 alpha0, alpha1, alpha2, alpha3; SAMPLER, (int2)(out_w_base_id + out_w_id3, output_bh_idx)); } -#elif defined(PRELU_ALL) + //} +#elif defined(PRELU_ALL) //{ alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); alpha0.y = alpha0.x; alpha0.z = alpha0.x; @@ -189,6 +191,7 @@ CL_DTYPE4 alpha0, alpha1, alpha2, alpha3; alpha1 = alpha0; alpha2 = alpha0; alpha3 = alpha0; + //} #endif out0 = activation_type4(out0, alpha0); out1 = activation_type4(out1, alpha1); diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl index 9826e38aad7..6c10d123548 100755 --- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl @@ -96,15 +96,18 @@ __kernel void depth_conv2d(__private const int global_size_dim0, #endif CL_DTYPE4 alpha0; -#ifdef PRELU_CH +#ifdef PRELU_CH //{ alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); -#elif defined(PRELU_ELE) + //} +#elif defined(PRELU_ELE) //{ alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); -#else + //} +#else //{ alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); alpha0.y = alpha0.x; alpha0.z = alpha0.x; alpha0.w = alpha0.x; + //} #endif output = activation_type4(output, alpha0); diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl index 1ae7a8ae0f4..c5ef38038e3 100755 --- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl @@ -207,15 +207,18 @@ __kernel void depth_conv2d_3x3( } CL_DTYPE4 alpha0; -#ifdef PRELU_CH +#ifdef PRELU_CH //{ alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0)); -#elif defined(PRELU_ELE) + //} +#elif defined(PRELU_ELE) //{ alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos); -#else + //} +#else //{ alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); alpha0.y = alpha0.x; alpha0.z = alpha0.x; alpha0.w = alpha0.x; + //} #endif output = activation_type4(output, alpha0); @@ -377,23 +380,25 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, output[1] = mad(inputs[11], filters[8], output[1]); CL_DTYPE4 alpha[2]; -#ifdef PRELU_CH +#ifdef PRELU_CH //{ alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(ou_ch_blk_id, 0)); alpha[1] = alpha[0]; -#elif defined(PRELU_ELE) + //} +#elif defined(PRELU_ELE) //{ alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(ou_x, ou_nh_id)); if (ou_col_id + 1 < ou_w) { alpha[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(ou_x + 1, ou_nh_id)); } -#else + //} +#else //{ alpha[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); alpha[0].y = alpha[0].x; alpha[0].z = alpha[0].x; alpha[0].w = alpha[0].x; alpha[1] = alpha[0]; - + //} #endif output[0] = activation_type4(output[0], alpha[0]); output[1] = activation_type4(output[1], alpha[1]); From 3be4f7953ec5562c4b39d2b8c0cfef05913c271f Mon Sep 17 00:00:00 2001 From: daming5432 Date: Mon, 1 Mar 2021 11:05:11 +0000 Subject: [PATCH 07/11] test=develop From 8fd713cbe6172b89019e3a18e217fe101fb58e13 Mon Sep 17 00:00:00 2001 From: daming5432 Date: Mon, 1 Mar 2021 13:43:44 +0000 Subject: [PATCH 08/11] rm some waste code test=develop --- lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl | 3 --- 1 file changed, 3 deletions(-) diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl index 092bcf49355..2ffd988f180 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl @@ -139,9 +139,6 @@ CL_DTYPE4 alpha0; //} #endif output = activation_type4(output, alpha0); -#else - output = activation_type4(output); -#endif #ifdef SCALE_ACTIVATION output = fuse_scale(output, 1.f, 0.f, 0.f); From 3c322d980f5a5883b8e1624ff9bb308f505a295a Mon Sep 17 00:00:00 2001 From: daming5432 Date: Mon, 1 Mar 2021 18:10:26 +0000 Subject: [PATCH 09/11] test=develop From fdc7dac9d23ff777a029b5bba8c97b71745f52c2 Mon Sep 17 00:00:00 2001 From: daming5432 Date: Tue, 2 Mar 2021 05:24:22 +0000 Subject: [PATCH 10/11] add support for depth_conv2d_common test=develop --- .../image/depthwise_conv2d_basic_kernel.cl | 52 +++++++++++++++++-- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl index 8d5c34553e4..717db4784b6 100755 --- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl @@ -33,7 +33,8 @@ __kernel void depth_conv2d_common(__private const int global_size_dim0, // (out_ __private const int output_width, __private const int output_height, __private const int filter_width, - __private const int filter_height) { + __private const int filter_height, + __read_only image2d_t prelu_alpha) { const int out_c_blk = get_global_id(0); // [0, (C+3)/4) const int out_w_blk = get_global_id(1); // [0, (W+3)/4) @@ -107,10 +108,51 @@ __kernel void depth_conv2d_common(__private const int global_size_dim0, // (out_ } } - out0 = activation_type4(out0); - out1 = activation_type4(out1); - out2 = activation_type4(out2); - out3 = activation_type4(out3); +CL_DTYPE4 alpha0, alpha1, alpha2, alpha3; +#ifdef PRELU_CH //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_channel_block_idx, 0)); + alpha1 = alpha0; + alpha2 = alpha0; + alpha3 = alpha0; + //} +#elif defined(PRELU_ELE) //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id0, output_bh_idx)); + if (out_w_id1 < output_width) { + alpha1 = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id1, output_bh_idx)); + } + if (out_w_id2 < output_width) { + alpha2 = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id2, output_bh_idx)); + } + if (out_w_id3 < output_width) { + alpha3 = READ_IMG_TYPE(CL_DTYPE_CHAR, + prelu_alpha, + SAMPLER, + (int2)(out_w_base_id + out_w_id3, output_bh_idx)); + } + //} +#elif defined(PRELU_ALL) //{ + alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0)); + alpha0.y = alpha0.x; + alpha0.z = alpha0.x; + alpha0.w = alpha0.x; + alpha1 = alpha0; + alpha2 = alpha0; + alpha3 = alpha0; + //} +#endif + out0 = activation_type4(out0, alpha0); + out1 = activation_type4(out1, alpha1); + out2 = activation_type4(out2, alpha2); + out3 = activation_type4(out3, alpha3); #ifdef SCALE_ACTIVATION out0 = fuse_scale(out0, 1.f, 0.f, 0.f); From 10add2e10930c504069dfb348ec96ea62ce59004 Mon Sep 17 00:00:00 2001 From: daming5432 Date: Tue, 2 Mar 2021 06:43:34 +0000 Subject: [PATCH 11/11] fix arg bug test=develop --- lite/kernels/opencl/conv_image_compute.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc index 2773984ba8f..17e1941bae8 100644 --- a/lite/kernels/opencl/conv_image_compute.cc +++ b/lite/kernels/opencl/conv_image_compute.cc @@ -1062,7 +1062,7 @@ void ConvImageCompute::DepthwiseConv2d() { CL_CHECK_FATAL(status_); status_ = kernel_.setArg(18, filter_tensor_h_); CL_CHECK_FATAL(status_); - status_ = kernel_.setArg(18, *alpha_image_p_); + status_ = kernel_.setArg(19, *alpha_image_p_); CL_CHECK_FATAL(status_); }