[OpenCL] Fix pool_local kernel (#8397)

PaddlePaddle · Feb 8, 2022 · f4434cf · f4434cf
1 parent 2e5dc27
commit f4434cf
Show file tree

Hide file tree

Showing 2 changed files with 58 additions and 53 deletions.
diff --git a/lite/backends/opencl/cl_kernel/image/pool_kernel.cl b/lite/backends/opencl/cl_kernel/image/pool_kernel.cl
@@ -26,6 +26,7 @@ __kernel void pool(__read_only image2d_t input,
                    __private const int stride_w,
                    __private const int pad_top,
                    __private const int pad_left,
+                   __private const int exclusive,
                    __private const int adaptive) {
   const int out_c = get_global_id(0);
   const int out_w = get_global_id(1);
@@ -37,8 +38,8 @@ __kernel void pool(__read_only image2d_t input,
   int pool_size = 1;
   if (adaptive == 1) {
     start_h = floor((out_h * in_height) / (float)out_height);
-    end_h = ceil(((out_h + 1) * in_height) / (float)out_height);
     start_w = floor((out_w * in_width) / (float)out_width);
+    end_h = ceil(((out_h + 1) * in_height) / (float)out_height);
     end_w = ceil(((out_w + 1) * in_width) / (float)out_width);
   } else {
     start_h = out_h * stride_h - pad_top;
@@ -67,10 +68,7 @@ __kernel void pool(__read_only image2d_t input,
     }
   }
 
-#ifdef EXCLUSIVE
-  pool_size = (end_h - start_h) * (end_w - start_w);
-#endif  // EXCLUSIVE
-  if (adaptive == 1) {
+  if (exclusive == 1 || adaptive == 1) {
     pool_size = (end_h - start_h) * (end_w - start_w);
   }
 
@@ -110,6 +108,8 @@ __kernel void pool_local(__read_only image2d_t input,
                          __private const int stride_w,
                          __private const int pad_top,
                          __private const int pad_left,
+                         __private const int exclusive,
+                         __private const int adaptive,
                          __private const int local_block_size,
                          __private const int2 local_block_size_wh,
                          __private const int2 local_block_count_wh,
@@ -119,7 +119,7 @@ __kernel void pool_local(__read_only image2d_t input,
   const int out_nh = get_global_id(2);
   const int out_n = out_nh / out_height;
   // const int out_h = out_nh % out_height;
-  const int out_h = out_nh - mul24(out_h, out_height);
+  const int out_h = out_nh - mul24(out_n, out_height);
 
   const int local_id = get_local_id(0);
   const int local_width_id = local_id % local_block_size_wh.x;
@@ -131,6 +131,7 @@ __kernel void pool_local(__read_only image2d_t input,
   const int input_width_start = mad24(out_w, stride_w, -pad_left);
 
 #ifdef POOL_AVG
+  // 1. Get data from global memroy to local memory
   __local float4* avg_output = (__local float4*)local_output;
   avg_output[local_id] = (float4)0;
   int pos_h = local_height_id;
@@ -161,6 +162,7 @@ __kernel void pool_local(__read_only image2d_t input,
   }
   barrier(CLK_LOCAL_MEM_FENCE);
 
+  // 2. Reduce in each workgroup
   for (int stride_h = (local_block_size_wh.y >> 1); stride_h > 0;
        stride_h >>= 1) {
     if (local_height_id < stride_h) {
@@ -169,7 +171,6 @@ __kernel void pool_local(__read_only image2d_t input,
     }
     barrier(CLK_LOCAL_MEM_FENCE);
   }
-
   for (int stride_w = (local_block_size_wh.x >> 1); stride_w > 0;
        stride_w >>= 1) {
     if (local_height_id == 0 && local_width_id < stride_w) {
@@ -179,16 +180,18 @@ __kernel void pool_local(__read_only image2d_t input,
   }
 
   if (local_id == 0) {
-    const int kernel_height_start = max(0, input_height_start);
-    const int kernel_width_start = max(0, input_width_start);
-    const int kernel_height_end = min(input_height_start + ksize_h, in_height);
-    const int kernel_width_end = min(input_width_start + ksize_w, in_width);
-#ifdef EXCLUSIVE
-    const int block_size = mul24((kernel_height_end - kernel_height_start),
-                                 (kernel_width_end - kernel_width_start));
-#else
-    const int block_size = ksize_w * ksize_h;
-#endif  // EXCLUSIVE
+    int block_size;
+    if (exclusive == 1 || adaptive == 1) {
+      const int kernel_height_start = max(0, input_height_start);
+      const int kernel_width_start = max(0, input_width_start);
+      const int kernel_height_end =
+          min(input_height_start + ksize_h, in_height);
+      const int kernel_width_end = min(input_width_start + ksize_w, in_width);
+      block_size = mul24((kernel_height_end - kernel_height_start),
+                         (kernel_width_end - kernel_width_start));
+    } else {
+      block_size = ksize_w * ksize_h;
+    }
     avg_output[local_id] = avg_output[local_id] / (float)block_size;
 
     const int output_channel_width_idx = mad24(out_c, out_width, out_w);
@@ -201,6 +204,7 @@ __kernel void pool_local(__read_only image2d_t input,
         CL_DTYPE_CHAR, output, (int2)(output_channel_width_idx, out_nh), res);
   }
 #else
+  // 1. Get data from global memroy to local memory
   local_output[local_id] = (CL_DTYPE4)(-FLT_MAX);
   int pos_h = local_height_id;
 
@@ -236,9 +240,9 @@ __kernel void pool_local(__read_only image2d_t input,
     }
     pos_h += local_block_size_wh.y;
   }
-
   barrier(CLK_LOCAL_MEM_FENCE);
 
+  // 2. Reduce in each workgroup
   for (int stride_h = (local_block_size_wh.y >> 1); stride_h > 0;
        stride_h >>= 1) {
     if (local_height_id < stride_h) {
@@ -248,7 +252,6 @@ __kernel void pool_local(__read_only image2d_t input,
     }
     barrier(CLK_LOCAL_MEM_FENCE);
   }
-
   for (int stride_w = (local_block_size_wh.x >> 1); stride_w > 0;
        stride_w >>= 1) {
     if (local_height_id == 0 && local_width_id < stride_w) {

diff --git a/lite/kernels/opencl/pool_image_compute.cc b/lite/kernels/opencl/pool_image_compute.cc
@@ -38,7 +38,6 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
     const auto& in_dims = param.x->dims();
     const auto& out_dims = param.output->dims();
     const bool global_pooling = param.global_pooling;
-    const bool exclusive = param.exclusive;
     const bool adaptive = param.adaptive;
     const std::string padding_algorithm = param.padding_algorithm;
     const std::vector<int>& ksize = param.ksize;
@@ -66,9 +65,6 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
                              param.strides,
                              ksize);
 
-    if (exclusive) {
-      build_options_ += " -DEXCLUSIVE";
-    }
     if (global_pooling) {
       build_options_ += " -DGLOBAL";
       ksize_.resize(static_cast<size_t>(in_dims.size()) - 2);
@@ -81,8 +77,7 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
     run_local_work_ =
         out_dims[0] * UP_DIV(out_dims[1], 4) * out_dims[2] * out_dims[3] <
             low_op_parallelism_thre_ &&
-        ksize_[0] * ksize_[1] >= high_op_intensity_thre_;
-    run_local_work_ = false;
+        ksize_[0] * ksize_[1] >= high_op_intensity_thre_ && !adaptive;
     if (run_local_work_) {
       kernel_func_name_ += "_local";
     }
@@ -154,39 +149,40 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
       }
 
       const int out_c_blks = UP_DIV(out_dims[1], 4);
-      uint32_t workgroup_size = 0;
-
-      int type_size =
-          (CLRuntime::Global()->get_precision() == lite_api::CL_PRECISION_FP16)
-              ? sizeof(uint16_t)
-              : sizeof(float);
-      if (pooling_type == "avg") {
-        type_size = sizeof(float);
-      }
-      uint32_t local_mem_size =
-          CLRuntime::Global()->GetDeviceInfo()["CL_DEVICE_LOCAL_MEM_SIZE_KB"] *
-          1024;
-      uint32_t workgroupsize_max =
-          CLRuntime::Global()->GetMaxWorkGroupSize(kernel_);
-
       uint32_t compute_intensity = ksize_[0] * ksize_[1];
       run_local_work_ = out_dims[0] * out_c_blks * out_dims[2] * out_dims[3] <
                             low_op_parallelism_thre_ &&
-                        compute_intensity >= high_op_intensity_thre_;
-      run_local_work_ = false;
+                        compute_intensity >= high_op_intensity_thre_ &&
+                        !adaptive;
       if (run_local_work_) {
-        workgroup_size =
+        // Calculate workgroup_w_size, workgroup_h_size
+        int type_size = (CLRuntime::Global()->get_precision() ==
+                         lite_api::CL_PRECISION_FP16)
+                            ? sizeof(uint16_t)
+                            : sizeof(float);
+        if (pooling_type == "avg") {
+          type_size = sizeof(float);
+        }
+        uint32_t local_mem_size =
+            CLRuntime::Global()
+                ->GetDeviceInfo()["CL_DEVICE_LOCAL_MEM_SIZE_KB"] *
+            1024;
+        uint32_t workgroupsize_max =
+            CLRuntime::Global()->GetMaxWorkGroupSize(kernel_);
+        uint32_t workgroup_size =
             std::min(static_cast<uint32_t>(local_mem_size / (4 * type_size)),
                      workgroupsize_max);
         workgroup_size =
             std::min(static_cast<uint32_t>(compute_intensity), workgroup_size);
+
+        // make workgroup_size floor-round to pow(2)
         uint32_t temp_size = 1;
         while ((temp_size <<= 1) <= workgroup_size) {
         }
         workgroup_size = temp_size >> 1;
-
+        // make workgroup_w_size floor-round to pow(2)
         int workgroup_w_size = 1, workgroup_h_size;
-        while ((workgroup_w_size <<= 1) <= ksize_[0] &&
+        while ((workgroup_w_size <<= 1) <= ksize_[1] &&
                workgroup_w_size <= workgroup_size) {
         }
         workgroup_w_size >>= 1;
@@ -198,14 +194,21 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
         local_work_size_ = cl::NDRange(workgroup_size, 1, 1);
 
         cl_int2 local_block_size_shape = {workgroup_w_size, workgroup_h_size};
-        cl_int2 local_block_count_shape = {UP_DIV(ksize_[0], workgroup_w_size),
-                                           UP_DIV(ksize_[1], workgroup_h_size)};
+        cl_int2 local_block_count_shape = {UP_DIV(ksize_[1], workgroup_w_size),
+                                           UP_DIV(ksize_[0], workgroup_h_size)};
 
-        int idx = 12;
+        int idx = 14;
         kernel_.setArg(idx++, static_cast<int>(workgroup_size));
         kernel_.setArg(idx++, local_block_size_shape);
         kernel_.setArg(idx++, local_block_count_shape);
         kernel_.setArg(idx++, workgroup_size * 4 * type_size, nullptr);
+#ifdef LITE_WITH_LOG
+        VLOG(4) << "workgroup_size: " << workgroup_size;
+        VLOG(4) << "local_block_size_shape(wh): " << local_block_size_shape.x
+                << " " << local_block_size_shape.y;
+        VLOG(4) << "local_block_count_shape(wh): " << local_block_count_shape.x
+                << " " << local_block_count_shape.y;
+#endif
       } else {
         global_work_size_ =
             cl::NDRange(out_c_blks, out_dims[3], out_dims[0] * out_dims[2]);
@@ -234,11 +237,10 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
       CL_CHECK_FATAL(status);
       status = kernel_.setArg(arg_idx++, paddings_[2]);
       CL_CHECK_FATAL(status);
-      if (kernel_func_name_ == "pool") {
-        int ad = param.adaptive;
-        status = kernel_.setArg(arg_idx++, ad);
-        CL_CHECK_FATAL(status);
-      }
+      status = kernel_.setArg(arg_idx++, static_cast<int>(exclusive));
+      CL_CHECK_FATAL(status);
+      status = kernel_.setArg(arg_idx++, static_cast<int>(adaptive));
+      CL_CHECK_FATAL(status);
 
 #ifdef LITE_WITH_LOG
       const std::vector<int>& paddings = *param.paddings;