Merge pull request #17372 from jenshannoschwalm/ce_opencl

Color equalizer OpenCL implementation
darktable-org · Aug 27, 2024 · 28f3414 · 28f3414
2 parents 80f8355 + 8468c73
commit 28f3414
Show file tree

Hide file tree

Showing 12 changed files with 1,279 additions and 81 deletions.
diff --git a/data/kernels/basic.cl b/data/kernels/basic.cl
@@ -971,7 +971,7 @@ interpolate_bilinear(read_only image2d_t in,
                      write_only image2d_t out,
                      const int width_out,
                      const int height_out,
-                     const int RGBa)
+                     const int ch) // works with 1-4 channels
 {
   const int x = get_global_id(0);
   const int y = get_global_id(1);

diff --git a/data/kernels/colorequal.cl b/data/kernels/colorequal.cl
diff --git a/data/kernels/colorspace.h b/data/kernels/colorspace.h
@@ -944,3 +944,50 @@ static inline float4 dt_UCS_HCB_to_JCH(const float4 HCB)
   JCH.x = HCB.z / (native_powr(HCB.y, 1.33654221029386f) + 1.f);
   return JCH;
 }
+
+static inline float4 dt_UCS_HSB_to_XYZ(const float4 HSB, const float L_w) // converts HSB to XYZ by chaining HSB -> JCH -> xyY -> XYZ
+{
+  const float4 JCH = dt_UCS_HSB_to_JCH(HSB); // step 1: HSB -> JCH
+  const float4 xyY = dt_UCS_JCH_to_xyY(JCH, L_w); // step 2: JCH -> xyY; L_w is only forwarded to this helper (presumably the white lightness, confirm there)
+  return dt_xyY_to_XYZ(xyY); // step 3: xyY -> XYZ, returned as float4
+}
+
+static inline float4 dt_UCS_LUV_to_JCH(const float L_star, const float L_white, const float4 UV_star_prime) // builds a JCH float4 from L_star and the .x/.y components of UV_star_prime; .z/.w of the input are not read
+{
+  const float M2 = UV_star_prime.x * UV_star_prime.x + UV_star_prime.y * UV_star_prime.y; // square of colorfulness M; a sum of squares, so never negative
+  const float4 JCH = {  L_star / L_white, // J: L_star normalized by L_white (no guard against L_white == 0 here)
+                        15.932993652962535f * native_powr(L_star, 0.6523997524738018f) * native_powr(M2, 0.6007557017508491f) / L_white, // C: native_powr needs a base >= 0; M2 always is, L_star is assumed to be (confirm with callers)
+                        atan2(UV_star_prime.y, UV_star_prime.x), // H: angle of the (x, y) vector in radians, range [-pi ; pi]
+                        0.0f }; // fourth component is unused and set to zero
+  return JCH;
+ }
+
+#define LUT_ELEM 360 // gamut LUT number of elements: resolution of 1°
+static inline float lookup_gamut(global const float *gamut_lut, const float x) // returns gamut_lut sampled at angle x with linear interpolation between the two neighbouring entries
+{
+  // WARNING: x should lie in [-pi ; pi], which is the default output range of atan2 anyway
+
+  // convert the angle to a fractional LUT coordinate: -pi maps to 0 and +pi maps to LUT_ELEM - 1
+  const float x_test = (LUT_ELEM - 1) * (x + M_PI_F) / (2.f * M_PI_F);
+
+  // find the 2 closest integer coordinates (previous/next)
+  float x_prev = floor(x_test);
+  float x_next = ceil(x_test);
+
+  // get the 2 closest LUT indices at integer coordinates;
+  // wrap around the hue ring if out of bounds (only reachable when x is outside [-pi ; pi])
+  int xi = (int)x_prev;
+  if(xi < 0) xi = LUT_ELEM - 1;
+  else if(xi > LUT_ELEM - 1) xi = 0;
+
+  int xii = (int)x_next;
+  if(xii < 0) xii = LUT_ELEM - 1;
+  else if(xii > LUT_ELEM - 1) xii = 0;
+
+  // fetch the LUT value at the previous index
+  const float y_prev = gamut_lut[xi];
+
+  // return y_prev if both indices coincide, otherwise interpolate linearly using the fractional part (x_test - x_prev) as weight
+  return y_prev + ((xi != xii) ? (x_test - x_prev) * (gamut_lut[xii] - y_prev) : 0.0f);
+}
+
diff --git a/data/kernels/extended.cl b/data/kernels/extended.cl
@@ -747,38 +747,6 @@ static inline float4 opacity_masks(const float x,
   return output;
 }
 
-
-#define LUT_ELEM 360 // gamut LUT number of elements: resolution of 1°
-
-static inline float lookup_gamut(global const float *gamut_lut, const float x)
-{
-  // WARNING : x should be between [-pi ; pi ], which is the default output of atan2 anyway
-
-  // convert in LUT coordinate
-  const float x_test = (LUT_ELEM - 1) * (x + M_PI_F) / (2.f * M_PI_F);
-
-  // find the 2 closest integer coordinates (next/previous)
-  float x_prev = floor(x_test);
-  float x_next = ceil(x_test);
-
-  // get the 2 closest LUT elements at integer coordinates
-  // cycle on the hue ring if out of bounds
-  int xi = (int)x_prev;
-  if(xi < 0) xi = LUT_ELEM - 1;
-  else if(xi > LUT_ELEM - 1) xi = 0;
-
-  int xii = (int)x_next;
-  if(xii < 0) xii = LUT_ELEM - 1;
-  else if(xii > LUT_ELEM - 1) xii = 0;
-
-  // fetch the corresponding y values
-  const float y_prev = gamut_lut[xi];
-
-  // return y_prev if we are on the same integer LUT element or do linear interpolation
-  return y_prev + ((xi != xii) ? (x_test - x_prev) * (gamut_lut[xii] - y_prev) : 0.0f);
-}
-
-
 typedef enum dt_iop_colorbalancrgb_saturation_t
 {
   DT_COLORBALANCE_SATURATION_JZAZBZ = 0, // $DESCRIPTION: "JzAzBz (2021)"

diff --git a/data/kernels/gaussian.cl b/data/kernels/gaussian.cl
@@ -24,8 +24,8 @@
    is needed to have read-write access to some buffers which openCL does not offer for image object. */
 
 
-kernel void 
-gaussian_transpose_4c(global float4 *in, global float4 *out, unsigned int width, unsigned int height, 
+kernel void
+gaussian_transpose_4c(global float4 *in, global float4 *out, unsigned int width, unsigned int height,
                       unsigned int blocksize, local float4 *buffer)
 {
   unsigned int x = get_global_id(0);
@@ -49,9 +49,34 @@ gaussian_transpose_4c(global float4 *in, global float4 *out, unsigned int width,
   }
 }
 
+kernel void
+gaussian_transpose_2c(global float2 *in, global float2 *out, unsigned int width, unsigned int height, // 2-channel transpose: in is read as height rows of width float2, out is written as width rows of height float2
+                      unsigned int blocksize, local float2 *buffer) // buffer: local scratch tile addressed with a row stride of blocksize + 1
+{
+  unsigned int x = get_global_id(0); // source column
+  unsigned int y = get_global_id(1); // source row
+
+  if((x < width) && (y < height)) // work-items outside the source image load nothing
+  {
+    const unsigned int iindex = mad24(y, width, x); // y * width + x
+    buffer[mad24(get_local_id(1), blocksize + 1, get_local_id(0))] = in[iindex]; // stage the element at tile[local_y][local_x]; the +1 stride is presumably padding against local-memory bank conflicts
+  }
 
-kernel void 
-gaussian_transpose_1c(global float *in, global float *out, unsigned int width, unsigned int height, 
+  barrier(CLK_LOCAL_MEM_FENCE); // every work-item of the group reaches this point before any of them reads the tile back
+
+  x = mad24(get_group_id(1), blocksize, get_local_id(0)); // destination column: the group ids are swapped relative to the source
+  y = mad24(get_group_id(0), blocksize, get_local_id(1)); // destination row; assumes the work-group is blocksize x blocksize (confirm with the host code)
+
+  if((x < height) && (y < width)) // bounds of the transposed image: height columns, width rows
+  {
+    const unsigned int oindex = mad24(y, height, x); // y * height + x
+    out[oindex] = buffer[mad24(get_local_id(0), blocksize + 1, get_local_id(1))]; // read the tile with swapped local indices, i.e. tile[local_x][local_y]
+  }
+}
+
+
+kernel void
+gaussian_transpose_1c(global float *in, global float *out, unsigned int width, unsigned int height,
                       unsigned int blocksize, local float *buffer)
 {
   unsigned int x = get_global_id(0);
@@ -76,7 +101,7 @@ gaussian_transpose_1c(global float *in, global float *out, unsigned int width, u
 }
 
 
-kernel void 
+kernel void
 gaussian_column_4c(global float4 *in, global float4 *out, unsigned int width, unsigned int height,
                   const float a0, const float a1, const float a2, const float a3, const float b1, const float b2,
                   const float coefp, const float coefn, const float4 Labmax, const float4 Labmin)
@@ -100,7 +125,6 @@ gaussian_column_4c(global float4 *in, global float4 *out, unsigned int width, un
   yb = xp * coefp;
   yp = yb;
 
-
   for(int y=0; y<height; y++)
   {
     const int idx = mad24((unsigned int)y, width, x);
@@ -113,7 +137,6 @@ gaussian_column_4c(global float4 *in, global float4 *out, unsigned int width, un
     yp = yc;
 
     out[idx] = yc;
-
   }
 
   // backward filter
@@ -122,25 +145,83 @@ gaussian_column_4c(global float4 *in, global float4 *out, unsigned int width, un
   yn = xn * coefn;
   ya = yn;
 
-
   for(int y=height-1; y>-1; y--)
   {
     const int idx = mad24((unsigned int)y, width, x);
 
     xc = clamp(in[idx], Labmin, Labmax);
     yc = (a2 * xn) + (a3 * xa) - (b1 * yn) - (b2 * ya);
 
-    xa = xn; 
-    xn = xc; 
-    ya = yn; 
+    xa = xn;
+    xn = xc;
+    ya = yn;
     yn = yc;
 
     out[idx] += yc;
+  }
+}
+
+kernel void
+gaussian_column_2c(global float2 *in, global float2 *out, unsigned int width, unsigned int height, // 2-channel recursive filter along columns: a forward pass writes out, a backward pass is added on top
+                  const float a0, const float a1, const float a2, const float a3, const float b1, const float b2, // a0/a1: forward input weights, a2/a3: backward input weights, b1/b2: feedback weights used by both passes
+                  const float coefp, const float coefn, const float2 Labmax, const float2 Labmin) // coefp/coefn scale the edge samples to seed each pass; Labmin/Labmax clamp every input sample
+{
+  const unsigned int x = get_global_id(0); // one work-item handles one whole column
+
+  if(x >= width) return;
+
+  float2 xp = (float2)0.0f; // forward pass: previous input
+  float2 yb = (float2)0.0f; // forward pass: output two steps back
+  float2 yp = (float2)0.0f; // forward pass: previous output
+  float2 xc = (float2)0.0f; // current (clamped) input, used by both passes
+  float2 yc = (float2)0.0f; // current output, used by both passes
+  float2 xn = (float2)0.0f; // backward pass: input of the row below
+  float2 xa = (float2)0.0f; // backward pass: input two rows below
+  float2 yn = (float2)0.0f; // backward pass: output of the row below
+  float2 ya = (float2)0.0f; // backward pass: output two rows below
+
+  // forward filter
+  xp = clamp(in[x], Labmin, Labmax); // 0*width+x, i.e. the top sample of this column
+  yb = xp * coefp; // seed both output history slots from the top sample
+  yp = yb;
+
+  for(int y=0; y<height; y++) // top to bottom
+  {
+    const int idx = mad24((unsigned int)y, width, x); // y * width + x
+
+    xc = clamp(in[idx], Labmin, Labmax);
+    yc = (a0 * xc) + (a1 * xp) - (b1 * yp) - (b2 * yb); // current and previous input minus feedback from the two previous outputs
+
+    xp = xc; // shift the history by one row
+    yb = yp;
+    yp = yc;
 
+    out[idx] = yc; // forward result is stored; the backward pass accumulates onto it
+  }
+
+  // backward filter
+  xn = clamp(in[mad24(height - 1, width, x)], Labmin, Labmax); // bottom sample of this column
+  xa = xn;
+  yn = xn * coefn; // seed both output history slots from the bottom sample
+  ya = yn;
+
+  for(int y=height-1; y>-1; y--) // bottom to top
+  {
+    const int idx = mad24((unsigned int)y, width, x); // y * width + x
+
+    xc = clamp(in[idx], Labmin, Labmax);
+    yc = (a2 * xn) + (a3 * xa) - (b1 * yn) - (b2 * ya); // uses only the inputs of the rows below; xc enters the history afterwards
+
+    xa = xn; // shift the history by one row
+    xn = xc;
+    ya = yn;
+    yn = yc;
+
+    out[idx] += yc; // sum of forward and backward passes is the final value
   }
 }
 
-kernel void 
+kernel void
 gaussian_column_1c(global float *in, global float *out, unsigned int width, unsigned int height,
                   const float a0, const float a1, const float a2, const float a3, const float b1, const float b2,
                   const float coefp, const float coefn, const float Labmax, const float Labmin)
@@ -164,7 +245,6 @@ gaussian_column_1c(global float *in, global float *out, unsigned int width, unsi
   yb = xp * coefp;
   yp = yb;
 
-
   for(int y=0; y<height; y++)
   {
     const int idx = mad24((unsigned int)y, width, x);
@@ -177,7 +257,6 @@ gaussian_column_1c(global float *in, global float *out, unsigned int width, unsi
     yp = yc;
 
     out[idx] = yc;
-
   }
 
   // backward filter
@@ -194,13 +273,12 @@ gaussian_column_1c(global float *in, global float *out, unsigned int width, unsi
     xc = clamp(in[idx], Labmin, Labmax);
     yc = (a2 * xn) + (a3 * xa) - (b1 * yn) - (b2 * ya);
 
-    xa = xn; 
-    xn = xc; 
-    ya = yn; 
+    xa = xn;
+    xn = xc;
+    ya = yn;
     yn = yc;
 
     out[idx] += yc;
-
   }
 }
 
@@ -225,8 +303,8 @@ lookup_unbounded(read_only image2d_t lut, const float x, global float *a)
 }
 
 
-kernel void 
-lowpass_mix(read_only image2d_t in, write_only image2d_t out, unsigned int width, unsigned int height, const float saturation, 
+kernel void
+lowpass_mix(read_only image2d_t in, write_only image2d_t out, unsigned int width, unsigned int height, const float saturation,
             read_only image2d_t ctable, global float *ca, read_only image2d_t ltable, global float *la, const int unbound)
 {
   const unsigned int x = get_global_id(0);
@@ -307,11 +385,11 @@ overlay(const float4 in_a, const float4 in_b, const float opacity, const float t
 #define UNBOUND_HIGHLIGHTS_A   (UNBOUND_A << 3)   /* 16 */
 #define UNBOUND_HIGHLIGHTS_B   (UNBOUND_B << 3)   /* 32 */
 
-kernel void 
-shadows_highlights_mix(read_only image2d_t in, read_only image2d_t mask, write_only image2d_t out, 
-                       unsigned int width, unsigned int height, 
+kernel void
+shadows_highlights_mix(read_only image2d_t in, read_only image2d_t mask, write_only image2d_t out,
+                       unsigned int width, unsigned int height,
                        const float shadows, const float highlights, const float compress,
-                       const float shadows_ccorrect, const float highlights_ccorrect, 
+                       const float shadows_ccorrect, const float highlights_ccorrect,
                        const unsigned int flags, const int unbound_mask, const float low_approximation,
                        const float whitepoint)
 {

diff --git a/data/kernels/programs.conf b/data/kernels/programs.conf
@@ -37,3 +37,4 @@ diffuse.cl              33
 blurs.cl                34
 bspline.cl              35
 sigmoid.cl              36
+colorequal.cl           37
diff --git a/src/common/gaussian.c b/src/common/gaussian.c
@@ -473,6 +473,8 @@ dt_gaussian_cl_global_t *dt_gaussian_init_cl_global()
   const int program = 6; // gaussian.cl, from programs.conf
   g->kernel_gaussian_column_1c = dt_opencl_create_kernel(program, "gaussian_column_1c");
   g->kernel_gaussian_transpose_1c = dt_opencl_create_kernel(program, "gaussian_transpose_1c");
+  g->kernel_gaussian_column_2c = dt_opencl_create_kernel(program, "gaussian_column_2c");
+  g->kernel_gaussian_transpose_2c = dt_opencl_create_kernel(program, "gaussian_transpose_2c");
   g->kernel_gaussian_column_4c = dt_opencl_create_kernel(program, "gaussian_column_4c");
   g->kernel_gaussian_transpose_4c = dt_opencl_create_kernel(program, "gaussian_transpose_4c");
   return g;
@@ -501,9 +503,9 @@ dt_gaussian_cl_t *dt_gaussian_init_cl(const int devid,
                                       const float sigma,  // gaussian sigma
                                       const int order)    // order of gaussian blur
 {
-  assert(channels == 1 || channels == 4);
+  assert(channels == 1 || channels == 2 || channels == 4);
 
-  if(!(channels == 1 || channels == 4)) return NULL;
+  if(!(channels == 1 || channels == 2 || channels == 4)) return NULL;
 
   dt_gaussian_cl_t *g = (dt_gaussian_cl_t *)malloc(sizeof(dt_gaussian_cl_t));
   if(!g) return NULL;
@@ -529,7 +531,8 @@ dt_gaussian_cl_t *dt_gaussian_init_cl(const int devid,
   }
 
   int kernel_gaussian_transpose = (channels == 1) ? g->global->kernel_gaussian_transpose_1c
-                                                  : g->global->kernel_gaussian_transpose_4c;
+                               : ((channels == 2) ? g->global->kernel_gaussian_transpose_2c
+                                                  : g->global->kernel_gaussian_transpose_4c);
   int blocksize;
 
   dt_opencl_local_buffer_t locopt
@@ -602,6 +605,11 @@ cl_int dt_gaussian_blur_cl(dt_gaussian_cl_t *g, cl_mem dev_in, cl_mem dev_out)
     kernel_gaussian_column = g->global->kernel_gaussian_column_1c;
     kernel_gaussian_transpose = g->global->kernel_gaussian_transpose_1c;
   }
+  else if(channels == 2)
+  {
+    kernel_gaussian_column = g->global->kernel_gaussian_column_2c;
+    kernel_gaussian_transpose = g->global->kernel_gaussian_transpose_2c;
+  }
   else if(channels == 4)
   {
     kernel_gaussian_column = g->global->kernel_gaussian_column_4c;
@@ -689,6 +697,8 @@ void dt_gaussian_free_cl_global(dt_gaussian_cl_global_t *g)
   // destroy kernels
   dt_opencl_free_kernel(g->kernel_gaussian_column_1c);
   dt_opencl_free_kernel(g->kernel_gaussian_transpose_1c);
+  dt_opencl_free_kernel(g->kernel_gaussian_column_2c);
+  dt_opencl_free_kernel(g->kernel_gaussian_transpose_2c);
   dt_opencl_free_kernel(g->kernel_gaussian_column_4c);
   dt_opencl_free_kernel(g->kernel_gaussian_transpose_4c);
   free(g);

diff --git a/src/common/gaussian.h b/src/common/gaussian.h
@@ -61,6 +61,7 @@ void dt_gaussian_free(dt_gaussian_t *g);
 typedef struct dt_gaussian_cl_global_t // kernel ids for gaussian.cl, filled by dt_gaussian_init_cl_global() via dt_opencl_create_kernel()
 {
   int kernel_gaussian_column_4c, kernel_gaussian_transpose_4c; // "gaussian_column_4c" / "gaussian_transpose_4c" (float4 buffers)
+  int kernel_gaussian_column_2c, kernel_gaussian_transpose_2c; // "gaussian_column_2c" / "gaussian_transpose_2c" (float2 buffers)
   int kernel_gaussian_column_1c, kernel_gaussian_transpose_1c; // "gaussian_column_1c" / "gaussian_transpose_1c" (float buffers)
 } dt_gaussian_cl_global_t;