From 6c563c9d968c51f2dee286eb80a0677d2171fca3 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 19 Nov 2018 22:06:32 +0100 Subject: [PATCH] OpenCL reduce API overhead - remove useless `clFinish` - avoid download num threads for skein&co and start always as much threads as in all other kernel (terminate useless threads) --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 59 ++++--------------- .../backend/amd/amd_gpu/opencl/cryptonight.cl | 24 ++++---- 2 files changed, 23 insertions(+), 60 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 6e1c70b05..734ebb63b 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -1116,6 +1116,12 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3); return ERR_OCL_API; } + + if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4); + return(ERR_OCL_API); + } } return ERR_SUCCESS; @@ -1158,8 +1164,6 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) return ERR_OCL_API; } - clFinish(ctx->CommandQueues); - size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { 8, 8 }; if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) { @@ -1181,64 +1185,23 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) return ERR_OCL_API; } - if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[2], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces, 0, NULL, NULL)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); - return ERR_OCL_API; - } - - if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[3], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces + 1, 0, NULL, NULL)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); - return ERR_OCL_API; - } - - if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[4], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces + 2, 0, NULL, NULL)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); - return ERR_OCL_API; - } - - if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[5], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces + 3, 0, NULL, NULL)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); - return ERR_OCL_API; - } - - clFinish(ctx->CommandQueues); - for(int i = 0; i < 4; ++i) { - if(BranchNonces[i]) + size_t tmpNonce = ctx->Nonce; + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) { - // Threads - cl_uint numThreads = BranchNonces[i]; - if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4); - return(ERR_OCL_API); - } - - // round up to next multiple of w_size - BranchNonces[i] = ((BranchNonces[i] + w_size - 1u) / w_size) * w_size; - // number of global threads must be a multiple of the work group size (w_size) - assert(BranchNonces[i]%w_size == 0); - size_t tmpNonce = ctx->Nonce; - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][i + 3], 1, &tmpNonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); - return ERR_OCL_API; - } + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); + return ERR_OCL_API; } } + // this call is blocking therefore the access to the results without cl_finish is fine if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_TRUE, 0, sizeof(cl_uint) * 0x100, HashOutput, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); return ERR_OCL_API; } - clFinish(ctx->CommandQueues); auto & numHashValues = HashOutput[0xFF]; // avoid out of memory read, we have only storage for 0xFF results if(numHashValues > 0xFF) diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 81c0d5ff9..dd9f1d4e2 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -401,7 +401,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, AES2[i] = rotate(tmp, 16U); AES3[i] = rotate(tmp, 24U); } - + __local ulong State_buf[8 * 25]; barrier(CLK_LOCAL_MEM_FENCE); @@ -474,12 +474,12 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, } mem_fence(CLK_LOCAL_MEM_FENCE); - + // cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 #if (ALGO == 4 || ALGO == 9 || ALGO == 10) __local uint4 xin[8][8]; { - + /* Also left over threads perform this loop. * The left over thread results will be ignored @@ -530,7 +530,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, )===" R"===( - + // cryptonight_monero_v8 && NVIDIA #if(ALGO==11 && defined(__NV_CL_C_VERSION)) # define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4)))) @@ -630,7 +630,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states tweak1_2 ^= as_uint2(states[24]); #endif } - + mem_fence(CLK_LOCAL_MEM_FENCE); #if(COMP_MODE==1) @@ -755,7 +755,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #endif ((uint4 *)a)[0] ^= tmp; - + // cryptonight_monero_v8 #if (ALGO == 11) # if defined(__NV_CL_C_VERSION) @@ -815,7 +815,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states __local uint4 xin1[8][8]; __local uint4 xin2[8][8]; #endif - + #if(COMP_MODE==1) // do not use early return here if(gIdx < Threads) @@ -971,7 +971,7 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u const ulong idx = get_global_id(0) - get_global_offset(0); // do not use early return here - if(idx < Threads) + if(idx < BranchBuf[Threads]) { states += 25 * BranchBuf[idx]; @@ -1019,8 +1019,8 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u ulong outIdx = atomic_inc(output + 0xFF); if(outIdx < 0xFF) output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0); - } } + } mem_fence(CLK_GLOBAL_MEM_FENCE); } @@ -1052,7 +1052,7 @@ __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint const uint idx = get_global_id(0) - get_global_offset(0); // do not use early return here - if(idx < Threads) + if(idx < BranchBuf[Threads]) { states += 25 * BranchBuf[idx]; @@ -1106,7 +1106,7 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u const uint idx = get_global_id(0) - get_global_offset(0); // do not use early return here - if(idx < Threads) + if(idx < BranchBuf[Threads]) { states += 25 * BranchBuf[idx]; @@ -1182,7 +1182,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global const uint idx = get_global_id(0) - get_global_offset(0); // do not use early return here - if(idx < Threads) + if(idx < BranchBuf[Threads]) { states += 25 * BranchBuf[idx];