From f40c54e34cad70bc164b1dc92dc0a78926ea09e6 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Tue, 20 Nov 2018 20:50:12 +0100 Subject: [PATCH] AMD: use more 32bit operations - change a few 64bit variables into 32bit. - provide defines type quallified --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 9 +++-- .../backend/amd/amd_gpu/opencl/cryptonight.cl | 34 +++++++++---------- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 6e1c70b05..fb200f6ba 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -43,7 +43,6 @@ #endif - #ifdef _WIN32 #include #include @@ -412,12 +411,12 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ std::string options; options += " -DITERATIONS=" + std::to_string(hashIterations); - options += " -DMASK=" + std::to_string(threadMemMask); - options += " -DWORKSIZE=" + std::to_string(ctx->workSize); + options += " -DMASK=" + std::to_string(threadMemMask) + "U"; + options += " -DWORKSIZE=" + std::to_string(ctx->workSize) + "U"; options += " -DSTRIDED_INDEX=" + std::to_string(strided_index); - options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(mem_chunk_exp); + options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(mem_chunk_exp) + "U"; options += " -DCOMP_MODE=" + std::to_string(ctx->compMode ? 1u : 0u); - options += " -DMEMORY=" + std::to_string(hashMemSize); + options += " -DMEMORY=" + std::to_string(hashMemSize) + "LLU"; options += " -DALGO=" + std::to_string(miner_algo[ii]); options += " -DCN_UNROLL=" + std::to_string(ctx->unroll); /* AMD driver output is something like: `1445.5 (VM)` diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 81c0d5ff9..49213e694 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -401,7 +401,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, AES2[i] = rotate(tmp, 16U); AES3[i] = rotate(tmp, 24U); } - + __local ulong State_buf[8 * 25]; barrier(CLK_LOCAL_MEM_FENCE); @@ -474,12 +474,12 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, } mem_fence(CLK_LOCAL_MEM_FENCE); - + // cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 #if (ALGO == 4 || ALGO == 9 || ALGO == 10) __local uint4 xin[8][8]; { - + /* Also left over threads perform this loop. * The left over thread results will be ignored @@ -530,7 +530,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, )===" R"===( - + // cryptonight_monero_v8 && NVIDIA #if(ALGO==11 && defined(__NV_CL_C_VERSION)) # define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4)))) @@ -630,7 +630,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states tweak1_2 ^= as_uint2(states[24]); #endif } - + mem_fence(CLK_LOCAL_MEM_FENCE); #if(COMP_MODE==1) @@ -638,7 +638,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states if(gIdx < Threads) #endif { - ulong idx0 = a[0] & MASK; + uint idx0 = as_uint2(a[0]).s0 & MASK; #pragma unroll CN_UNROLL for(int i = 0; i < ITERATIONS; ++i) @@ -646,7 +646,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states ulong c[2]; // cryptonight_monero_v8 && NVIDIA #if(ALGO==11 && defined(__NV_CL_C_VERSION)) - ulong idxS = idx0 & 0x30; + uint idxS = idx0 & 0x30U; *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL; #endif @@ -682,23 +682,23 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states # endif b_x[0].s2 ^= ((table >> index) & 0x30U) << 24; SCRATCHPAD_CHUNK(0) = b_x[0]; - idx0 = c[0] & MASK; + idx0 = as_uint2(c[0]).s0 & MASK; // cryptonight_monero_v8 #elif(ALGO==11) SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0]; # ifdef __NV_CL_C_VERSION // flush shuffled data SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line; - idx0 = c[0] & MASK; + idx0 = as_uint2(c[0]).s0 & MASK; idxS = idx0 & 0x30; *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL; # else - idx0 = c[0] & MASK; + idx0 = as_uint2(c[0]).s0 & MASK; # endif #else b_x[0] ^= ((uint4 *)c)[0]; SCRATCHPAD_CHUNK(0) = b_x[0]; - idx0 = c[0] & MASK; + idx0 = as_uint2(c[0]).s0 & MASK; #endif uint4 tmp; tmp = SCRATCHPAD_CHUNK(0); @@ -755,7 +755,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #endif ((uint4 *)a)[0] ^= tmp; - + // cryptonight_monero_v8 #if (ALGO == 11) # if defined(__NV_CL_C_VERSION) @@ -765,7 +765,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states b_x[1] = b_x[0]; #endif b_x[0] = ((uint4 *)c)[0]; - idx0 = a[0] & MASK; + idx0 = as_uint2(a[0]).s0 & MASK; // cryptonight_heavy || cryptonight_bittube2 #if (ALGO == 4 || ALGO == 10) @@ -773,14 +773,14 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2]; long q = fast_div_heavy(n, d | 0x5); *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q; - idx0 = (d ^ q) & MASK; + idx0 = (d ^ as_int2(q).s0) & MASK; // cryptonight_haven #elif (ALGO == 9) long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))); int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2]; long q = fast_div_heavy(n, d | 0x5); *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q; - idx0 = ((~d) ^ q) & MASK; + idx0 = ((~d) ^ as_int2(q).s0) & MASK; #endif } @@ -815,7 +815,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states __local uint4 xin1[8][8]; __local uint4 xin2[8][8]; #endif - + #if(COMP_MODE==1) // do not use early return here if(gIdx < Threads) @@ -1238,4 +1238,4 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global } } -)===" \ No newline at end of file +)==="