From bff5b000f3df79d0658057447534dcbe640b965f Mon Sep 17 00:00:00 2001 From: SChernykh Date: Tue, 20 Nov 2018 22:08:33 +0100 Subject: [PATCH] OpenCl: optimize cn-v8 div - optimize division --- .../backend/amd/amd_gpu/opencl/cryptonight.cl | 14 ++-- .../amd/amd_gpu/opencl/fast_int_math_v2.cl | 74 ++++++------------- 2 files changed, 31 insertions(+), 57 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 81c0d5ff9..5f9c370b3 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -401,7 +401,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, AES2[i] = rotate(tmp, 16U); AES3[i] = rotate(tmp, 24U); } - + __local ulong State_buf[8 * 25]; barrier(CLK_LOCAL_MEM_FENCE); @@ -474,12 +474,12 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, } mem_fence(CLK_LOCAL_MEM_FENCE); - + // cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 #if (ALGO == 4 || ALGO == 9 || ALGO == 10) __local uint4 xin[8][8]; { - + /* Also left over threads perform this loop. * The left over thread results will be ignored @@ -530,7 +530,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, )===" R"===( - + // cryptonight_monero_v8 && NVIDIA #if(ALGO==11 && defined(__NV_CL_C_VERSION)) # define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4)))) @@ -630,7 +630,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states tweak1_2 ^= as_uint2(states[24]); #endif } - + mem_fence(CLK_LOCAL_MEM_FENCE); #if(COMP_MODE==1) @@ -755,7 +755,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #endif ((uint4 *)a)[0] ^= tmp; - + // cryptonight_monero_v8 #if (ALGO == 11) # if defined(__NV_CL_C_VERSION) @@ -815,7 +815,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states __local uint4 xin1[8][8]; __local uint4 xin2[8][8]; #endif - + #if(COMP_MODE==1) // do not use early return here if(gIdx < Threads) diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl index 2c1b13865..f0e923479 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl @@ -68,61 +68,35 @@ inline uint get_reciprocal(const __local uchar *RCP, uint a) inline uint2 fast_div_v2(const __local uint *RCP, ulong a, uint b) { - const uint r = get_reciprocal((const __local uchar *)RCP, b); - const ulong k = mul_hi(as_uint2(a).s0, r) + ((ulong)(r) * as_uint2(a).s1) + a; - - ulong q; - ((uint*)&q)[0] = as_uint2(k).s1; - -#if defined(cl_amd_device_attribute_query) && (OPENCL_DRIVER_MAJOR == 14) - /* The AMD driver 14.XX is not able to compile `(k < a)` - * https://github.com/fireice-uk/xmr-stak/issues/1922 - * This is a workaround for the broken compiler. - */ - ulong whyAMDwhy; - ((uint*)&whyAMDwhy)[0] = as_uint2(k).s0; - ((uint*)&whyAMDwhy)[1] = as_uint2(k).s1; - ((uint*)&q)[1] = (whyAMDwhy < a) ? 1U : 0U; -#else - ((uint*)&q)[1] = (k < a) ? 1U : 0U; -#endif - - const long tmp = a - q * b; - const bool overshoot = (tmp < 0); - const bool undershoot = (tmp >= b); - - return (uint2)( - as_uint2(q).s0 + (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U), - as_uint2(tmp).s0 + (overshoot ? b : 0U) - (undershoot ? b : 0U) - ); + const uint r = get_reciprocal((const __local uchar *)RCP, b); + + const ulong k = mul_hi(as_uint2(a).s0, r) + ((ulong)(r) * as_uint2(a).s1) + a; + const uint q = as_uint2(k).s1; + long tmp = a - ((ulong)(q) * b); + ((int*)&tmp)[1] -= (as_uint2(k).s1 < as_uint2(a).s1) ? b : 0; + const int overshoot = ((int*)&tmp)[1] >> 31; + const int undershoot = as_int2(as_uint(b - 1) - tmp).s1 >> 31; + return (uint2)(q + overshoot - undershoot, as_uint2(tmp).s0 + (as_uint(overshoot) & b) - (as_uint(undershoot) & b)); } - inline uint fast_sqrt_v2(const ulong n1) { - float x = as_float((as_uint2(n1).s1 >> 9) + ((64U + 127U) << 23)); - - float x1 = native_rsqrt(x); - x = native_sqrt(x); - - // The following line does x1 *= 4294967296.0f; - x1 = as_float(as_uint(x1) + (32U << 23)); - - const uint x0 = as_uint(x) - (158U << 23); - const long delta0 = n1 - (((long)(x0) * x0) << 18); - const float delta = convert_float_rte(as_int2(delta0).s1) * x1; - - uint result = (x0 << 10) + convert_int_rte(delta); - const uint s = result >> 1; - const uint b = result & 1; - - const ulong x2 = (ulong)(s) * (s + b) + ((ulong)(result) << 32) - n1; - if ((long)(x2 + b) > 0) --result; - if ((long)(x2 + 0x100000000UL + s) < 0) ++result; - - return result; + float x = as_float((as_uint2(n1).s1 >> 9) + ((64U + 127U) << 23)); + float x1 = native_rsqrt(x); + x = native_sqrt(x); + // The following line does x1 *= 4294967296.0f; + x1 = as_float(as_uint(x1) + (32U << 23)); + const uint x0 = as_uint(x) - (158U << 23); + const long delta0 = n1 - (as_ulong((uint2)(mul24(x0, x0), mul_hi(x0, x0))) << 18); + const float delta = convert_float_rte(as_int2(delta0).s1) * x1; + uint result = (x0 << 10) + convert_int_rte(delta); + const uint s = result >> 1; + const uint b = result & 1; + const ulong x2 = (ulong)(s) * (s + b) + ((ulong)(result) << 32) - n1; + if ((long)(x2 + as_int(b - 1)) >= 0) --result; + if ((long)(x2 + 0x100000000UL + s) < 0) ++result; + return result; } #endif )===" - \ No newline at end of file