From 0f06ca3c656c6f860dec576106de09e4e9faa1d6 Mon Sep 17 00:00:00 2001 From: George Hotz <72895+geohot@users.noreply.github.com> Date: Fri, 18 Feb 2022 20:15:20 -0800 Subject: [PATCH] camerad: fast debayer on c2 cameras (#23795) * fast debayer on c2 dcam * add casts * 128 local worksize on HDR debayer, 8 ms -> 3.5 ms * width instead of saving rgb_width Co-authored-by: Comma Device --- selfdrive/camerad/cameras/camera_common.cc | 19 ++++++++++++++++--- selfdrive/camerad/cameras/debayer.cl | 9 +++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/selfdrive/camerad/cameras/camera_common.cc b/selfdrive/camerad/cameras/camera_common.cc index a3e5eb48beebdd..33bca2f65190f3 100644 --- a/selfdrive/camerad/cameras/camera_common.cc +++ b/selfdrive/camerad/cameras/camera_common.cc @@ -37,6 +37,7 @@ class Debayer { Debayer(cl_device_id device_id, cl_context context, const CameraBuf *b, const CameraState *s) { char args[4096]; const CameraInfo *ci = &s->ci; + hdr_ = ci->hdr; snprintf(args, sizeof(args), "-cl-fast-relaxed-math -cl-denorms-are-zero " "-DFRAME_WIDTH=%d -DFRAME_HEIGHT=%d -DFRAME_STRIDE=%d " @@ -63,9 +64,20 @@ class Debayer { CL_CHECK(clSetKernelArg(krnl_, 2, localMemSize, 0)); CL_CHECK(clEnqueueNDRangeKernel(q, krnl_, 2, NULL, globalWorkSize, localWorkSize, 0, 0, debayer_event)); } else { - const size_t debayer_work_size = height; // doesn't divide evenly, is this okay? - CL_CHECK(clSetKernelArg(krnl_, 2, sizeof(float), &gain)); - CL_CHECK(clEnqueueNDRangeKernel(q, krnl_, 1, NULL, &debayer_work_size, NULL, 0, 0, debayer_event)); + if (hdr_) { + // HDR requires a 1-D kernel due to the DPCM compression + const size_t debayer_local_worksize = 128; + const size_t debayer_work_size = height; // doesn't divide evenly, is this okay? + CL_CHECK(clSetKernelArg(krnl_, 2, sizeof(float), &gain)); + CL_CHECK(clEnqueueNDRangeKernel(q, krnl_, 1, NULL, &debayer_work_size, &debayer_local_worksize, 0, 0, debayer_event)); + } else { + const int debayer_local_worksize = 32; + assert(width % 2 == 0); + const size_t globalWorkSize[] = {size_t(height), size_t(width / 2)}; + const size_t localWorkSize[] = {debayer_local_worksize, debayer_local_worksize}; + CL_CHECK(clSetKernelArg(krnl_, 2, sizeof(float), &gain)); + CL_CHECK(clEnqueueNDRangeKernel(q, krnl_, 2, NULL, globalWorkSize, localWorkSize, 0, 0, debayer_event)); + } } } @@ -75,6 +87,7 @@ class Debayer { private: cl_kernel krnl_; + bool hdr_; }; void CameraBuf::init(cl_device_id device_id, cl_context context, CameraState *s, VisionIpcServer * v, int frame_cnt, VisionStreamType init_rgb_type, VisionStreamType init_yuv_type, release_cb init_release_callback) { diff --git a/selfdrive/camerad/cameras/debayer.cl b/selfdrive/camerad/cameras/debayer.cl index 5188dc88c1926e..4e4b832203d3de 100644 --- a/selfdrive/camerad/cameras/debayer.cl +++ b/selfdrive/camerad/cameras/debayer.cl @@ -26,6 +26,8 @@ float3 srgb_gamma(float3 p) { return select(ph, pl, islessequal(p, 0.0031308f)); } +#if HDR + __constant int dpcm_lookup[512] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, 935, 951, 967, 983, 999, 1015, 1031, 1047, 1063, 1079, 1095, 1111, 1127, 1143, 1159, 1175, 1191, 1207, 1223, 1239, 1255, 1271, 1287, 1303, 1319, 1335, 1351, 1367, 1383, 1399, 1415, 1431, -935, -951, -967, -983, -999, -1015, -1031, -1047, -1063, -1079, -1095, -1111, -1127, -1143, -1159, -1175, -1191, -1207, -1223, -1239, -1255, -1271, -1287, -1303, -1319, -1335, -1351, -1367, -1383, -1399, -1415, -1431, 419, 427, 435, 443, 451, 459, 467, 475, 483, 491, 499, 507, 515, 523, 531, 539, 547, 555, 563, 571, 579, 587, 595, 603, 611, 619, 627, 635, 643, 651, 659, 667, 675, 683, 691, 699, 707, 715, 723, 731, 739, 747, 755, 763, 771, 779, 787, 795, 803, 811, 819, 827, 835, 843, 851, 859, 867, 875, 883, 891, 899, 907, 915, 923, -419, -427, -435, -443, -451, -459, -467, -475, -483, -491, -499, -507, -515, -523, -531, -539, -547, -555, -563, -571, -579, -587, -595, -603, -611, -619, -627, -635, -643, -651, -659, -667, -675, -683, -691, -699, -707, -715, -723, -731, -739, -747, -755, -763, -771, -779, -787, -795, -803, -811, -819, -827, -835, -843, -851, -859, -867, -875, -883, -891, -899, -907, -915, -923, 161, 165, 169, 173, 177, 181, 185, 189, 193, 197, 201, 205, 209, 213, 217, 221, 225, 229, 233, 237, 241, 245, 249, 253, 257, 261, 265, 269, 273, 277, 281, 285, 289, 293, 297, 301, 305, 309, 313, 317, 321, 325, 329, 333, 337, 341, 345, 349, 353, 357, 361, 365, 369, 373, 377, 381, 385, 389, 393, 397, 401, 405, 409, 413, -161, -165, -169, -173, -177, -181, -185, -189, -193, -197, -201, -205, -209, -213, -217, -221, -225, -229, -233, -237, -241, -245, -249, -253, -257, -261, -265, -269, -273, -277, -281, -285, -289, -293, -297, -301, -305, -309, -313, -317, -321, -325, -329, -333, -337, -341, -345, -349, -353, -357, -361, -365, -369, -373, -377, -381, -385, -389, -393, -397, -401, -405, -409, -413, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, -32, -34, -36, -38, -40, -42, -44, -46, -48, -50, -52, -54, -56, -58, -60, -62, -64, -66, -68, -70, -72, -74, -76, -78, -80, -82, -84, -86, -88, -90, -92, -94, -96, -98, -100, -102, -104, -106, -108, -110, -112, -114, -116, -118, -120, -122, -124, -126, -128, -130, -132, -134, -136, -138, -140, -142, -144, -146, -148, -150, -152, -154, -156, -158}; inline uint4 decompress(uint4 p, uint4 pl) { @@ -35,6 +37,8 @@ inline uint4 decompress(uint4 p, uint4 pl) { return select(r2, r1, p < 0x200); } +#endif + __kernel void debayer10(__global uchar const * const in, __global uchar * out, float digital_gain) { @@ -42,8 +46,13 @@ __kernel void debayer10(__global uchar const * const in, if (oy >= RGB_HEIGHT) return; const int iy = oy * 2; +#if HDR uint4 pint_last; for (int ox = 0; ox < RGB_WIDTH; ox += 2) { +#else + int ox = get_global_id(1) * 2; + { +#endif const int ix = (ox/2) * 5; // TODO: why doesn't this work for the frontview