Skip to content

Commit

Permalink
[OpenCL] Fix pool_local kernel (#8397)
Browse files Browse the repository at this point in the history
  • Loading branch information
zhaoyang-star authored Feb 8, 2022
1 parent 2e5dc27 commit f4434cf
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 53 deletions.
41 changes: 22 additions & 19 deletions lite/backends/opencl/cl_kernel/image/pool_kernel.cl
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ __kernel void pool(__read_only image2d_t input,
__private const int stride_w,
__private const int pad_top,
__private const int pad_left,
__private const int exclusive,
__private const int adaptive) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
Expand All @@ -37,8 +38,8 @@ __kernel void pool(__read_only image2d_t input,
int pool_size = 1;
if (adaptive == 1) {
start_h = floor((out_h * in_height) / (float)out_height);
end_h = ceil(((out_h + 1) * in_height) / (float)out_height);
start_w = floor((out_w * in_width) / (float)out_width);
end_h = ceil(((out_h + 1) * in_height) / (float)out_height);
end_w = ceil(((out_w + 1) * in_width) / (float)out_width);
} else {
start_h = out_h * stride_h - pad_top;
Expand Down Expand Up @@ -67,10 +68,7 @@ __kernel void pool(__read_only image2d_t input,
}
}

#ifdef EXCLUSIVE
pool_size = (end_h - start_h) * (end_w - start_w);
#endif // EXCLUSIVE
if (adaptive == 1) {
if (exclusive == 1 || adaptive == 1) {
pool_size = (end_h - start_h) * (end_w - start_w);
}

Expand Down Expand Up @@ -110,6 +108,8 @@ __kernel void pool_local(__read_only image2d_t input,
__private const int stride_w,
__private const int pad_top,
__private const int pad_left,
__private const int exclusive,
__private const int adaptive,
__private const int local_block_size,
__private const int2 local_block_size_wh,
__private const int2 local_block_count_wh,
Expand All @@ -119,7 +119,7 @@ __kernel void pool_local(__read_only image2d_t input,
const int out_nh = get_global_id(2);
const int out_n = out_nh / out_height;
// const int out_h = out_nh % out_height;
const int out_h = out_nh - mul24(out_h, out_height);
const int out_h = out_nh - mul24(out_n, out_height);

const int local_id = get_local_id(0);
const int local_width_id = local_id % local_block_size_wh.x;
Expand All @@ -131,6 +131,7 @@ __kernel void pool_local(__read_only image2d_t input,
const int input_width_start = mad24(out_w, stride_w, -pad_left);

#ifdef POOL_AVG
// 1. Get data from global memroy to local memory
__local float4* avg_output = (__local float4*)local_output;
avg_output[local_id] = (float4)0;
int pos_h = local_height_id;
Expand Down Expand Up @@ -161,6 +162,7 @@ __kernel void pool_local(__read_only image2d_t input,
}
barrier(CLK_LOCAL_MEM_FENCE);

// 2. Reduce in each workgroup
for (int stride_h = (local_block_size_wh.y >> 1); stride_h > 0;
stride_h >>= 1) {
if (local_height_id < stride_h) {
Expand All @@ -169,7 +171,6 @@ __kernel void pool_local(__read_only image2d_t input,
}
barrier(CLK_LOCAL_MEM_FENCE);
}

for (int stride_w = (local_block_size_wh.x >> 1); stride_w > 0;
stride_w >>= 1) {
if (local_height_id == 0 && local_width_id < stride_w) {
Expand All @@ -179,16 +180,18 @@ __kernel void pool_local(__read_only image2d_t input,
}

if (local_id == 0) {
const int kernel_height_start = max(0, input_height_start);
const int kernel_width_start = max(0, input_width_start);
const int kernel_height_end = min(input_height_start + ksize_h, in_height);
const int kernel_width_end = min(input_width_start + ksize_w, in_width);
#ifdef EXCLUSIVE
const int block_size = mul24((kernel_height_end - kernel_height_start),
(kernel_width_end - kernel_width_start));
#else
const int block_size = ksize_w * ksize_h;
#endif // EXCLUSIVE
int block_size;
if (exclusive == 1 || adaptive == 1) {
const int kernel_height_start = max(0, input_height_start);
const int kernel_width_start = max(0, input_width_start);
const int kernel_height_end =
min(input_height_start + ksize_h, in_height);
const int kernel_width_end = min(input_width_start + ksize_w, in_width);
block_size = mul24((kernel_height_end - kernel_height_start),
(kernel_width_end - kernel_width_start));
} else {
block_size = ksize_w * ksize_h;
}
avg_output[local_id] = avg_output[local_id] / (float)block_size;

const int output_channel_width_idx = mad24(out_c, out_width, out_w);
Expand All @@ -201,6 +204,7 @@ __kernel void pool_local(__read_only image2d_t input,
CL_DTYPE_CHAR, output, (int2)(output_channel_width_idx, out_nh), res);
}
#else
// 1. Get data from global memroy to local memory
local_output[local_id] = (CL_DTYPE4)(-FLT_MAX);
int pos_h = local_height_id;

Expand Down Expand Up @@ -236,9 +240,9 @@ __kernel void pool_local(__read_only image2d_t input,
}
pos_h += local_block_size_wh.y;
}

barrier(CLK_LOCAL_MEM_FENCE);

// 2. Reduce in each workgroup
for (int stride_h = (local_block_size_wh.y >> 1); stride_h > 0;
stride_h >>= 1) {
if (local_height_id < stride_h) {
Expand All @@ -248,7 +252,6 @@ __kernel void pool_local(__read_only image2d_t input,
}
barrier(CLK_LOCAL_MEM_FENCE);
}

for (int stride_w = (local_block_size_wh.x >> 1); stride_w > 0;
stride_w >>= 1) {
if (local_height_id == 0 && local_width_id < stride_w) {
Expand Down
70 changes: 36 additions & 34 deletions lite/kernels/opencl/pool_image_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
const auto& in_dims = param.x->dims();
const auto& out_dims = param.output->dims();
const bool global_pooling = param.global_pooling;
const bool exclusive = param.exclusive;
const bool adaptive = param.adaptive;
const std::string padding_algorithm = param.padding_algorithm;
const std::vector<int>& ksize = param.ksize;
Expand Down Expand Up @@ -66,9 +65,6 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
param.strides,
ksize);

if (exclusive) {
build_options_ += " -DEXCLUSIVE";
}
if (global_pooling) {
build_options_ += " -DGLOBAL";
ksize_.resize(static_cast<size_t>(in_dims.size()) - 2);
Expand All @@ -81,8 +77,7 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
run_local_work_ =
out_dims[0] * UP_DIV(out_dims[1], 4) * out_dims[2] * out_dims[3] <
low_op_parallelism_thre_ &&
ksize_[0] * ksize_[1] >= high_op_intensity_thre_;
run_local_work_ = false;
ksize_[0] * ksize_[1] >= high_op_intensity_thre_ && !adaptive;
if (run_local_work_) {
kernel_func_name_ += "_local";
}
Expand Down Expand Up @@ -154,39 +149,40 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
}

const int out_c_blks = UP_DIV(out_dims[1], 4);
uint32_t workgroup_size = 0;

int type_size =
(CLRuntime::Global()->get_precision() == lite_api::CL_PRECISION_FP16)
? sizeof(uint16_t)
: sizeof(float);
if (pooling_type == "avg") {
type_size = sizeof(float);
}
uint32_t local_mem_size =
CLRuntime::Global()->GetDeviceInfo()["CL_DEVICE_LOCAL_MEM_SIZE_KB"] *
1024;
uint32_t workgroupsize_max =
CLRuntime::Global()->GetMaxWorkGroupSize(kernel_);

uint32_t compute_intensity = ksize_[0] * ksize_[1];
run_local_work_ = out_dims[0] * out_c_blks * out_dims[2] * out_dims[3] <
low_op_parallelism_thre_ &&
compute_intensity >= high_op_intensity_thre_;
run_local_work_ = false;
compute_intensity >= high_op_intensity_thre_ &&
!adaptive;
if (run_local_work_) {
workgroup_size =
// Calculate workgroup_w_size, workgroup_h_size
int type_size = (CLRuntime::Global()->get_precision() ==
lite_api::CL_PRECISION_FP16)
? sizeof(uint16_t)
: sizeof(float);
if (pooling_type == "avg") {
type_size = sizeof(float);
}
uint32_t local_mem_size =
CLRuntime::Global()
->GetDeviceInfo()["CL_DEVICE_LOCAL_MEM_SIZE_KB"] *
1024;
uint32_t workgroupsize_max =
CLRuntime::Global()->GetMaxWorkGroupSize(kernel_);
uint32_t workgroup_size =
std::min(static_cast<uint32_t>(local_mem_size / (4 * type_size)),
workgroupsize_max);
workgroup_size =
std::min(static_cast<uint32_t>(compute_intensity), workgroup_size);

// make workgroup_size floor-round to pow(2)
uint32_t temp_size = 1;
while ((temp_size <<= 1) <= workgroup_size) {
}
workgroup_size = temp_size >> 1;

// make workgroup_w_size floor-round to pow(2)
int workgroup_w_size = 1, workgroup_h_size;
while ((workgroup_w_size <<= 1) <= ksize_[0] &&
while ((workgroup_w_size <<= 1) <= ksize_[1] &&
workgroup_w_size <= workgroup_size) {
}
workgroup_w_size >>= 1;
Expand All @@ -198,14 +194,21 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
local_work_size_ = cl::NDRange(workgroup_size, 1, 1);

cl_int2 local_block_size_shape = {workgroup_w_size, workgroup_h_size};
cl_int2 local_block_count_shape = {UP_DIV(ksize_[0], workgroup_w_size),
UP_DIV(ksize_[1], workgroup_h_size)};
cl_int2 local_block_count_shape = {UP_DIV(ksize_[1], workgroup_w_size),
UP_DIV(ksize_[0], workgroup_h_size)};

int idx = 12;
int idx = 14;
kernel_.setArg(idx++, static_cast<int>(workgroup_size));
kernel_.setArg(idx++, local_block_size_shape);
kernel_.setArg(idx++, local_block_count_shape);
kernel_.setArg(idx++, workgroup_size * 4 * type_size, nullptr);
#ifdef LITE_WITH_LOG
VLOG(4) << "workgroup_size: " << workgroup_size;
VLOG(4) << "local_block_size_shape(wh): " << local_block_size_shape.x
<< " " << local_block_size_shape.y;
VLOG(4) << "local_block_count_shape(wh): " << local_block_count_shape.x
<< " " << local_block_count_shape.y;
#endif
} else {
global_work_size_ =
cl::NDRange(out_c_blks, out_dims[3], out_dims[0] * out_dims[2]);
Expand Down Expand Up @@ -234,11 +237,10 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
CL_CHECK_FATAL(status);
status = kernel_.setArg(arg_idx++, paddings_[2]);
CL_CHECK_FATAL(status);
if (kernel_func_name_ == "pool") {
int ad = param.adaptive;
status = kernel_.setArg(arg_idx++, ad);
CL_CHECK_FATAL(status);
}
status = kernel_.setArg(arg_idx++, static_cast<int>(exclusive));
CL_CHECK_FATAL(status);
status = kernel_.setArg(arg_idx++, static_cast<int>(adaptive));
CL_CHECK_FATAL(status);

#ifdef LITE_WITH_LOG
const std::vector<int>& paddings = *param.paddings;
Expand Down

0 comments on commit f4434cf

Please sign in to comment.