From 6c563c9d968c51f2dee286eb80a0677d2171fca3 Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Mon, 19 Nov 2018 22:06:32 +0100
Subject: [PATCH] OpenCL reduce API overhead

- remove useless `clFinish`
- avoid download num threads for skein&co and start always as much threads as in all other kernel (terminate useless threads)
---
 xmrstak/backend/amd/amd_gpu/gpu.cpp           | 59 ++++---------------
 .../backend/amd/amd_gpu/opencl/cryptonight.cl | 24 ++++----
 2 files changed, 23 insertions(+), 60 deletions(-)

diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 6e1c70b05..734ebb63b 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -1116,6 +1116,12 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
 			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3);
 			return ERR_OCL_API;
 		}
+
+		if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
+		{
+			printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4);
+			return(ERR_OCL_API);
+		}
 	}
 
 	return ERR_SUCCESS;
@@ -1158,8 +1164,6 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 		return ERR_OCL_API;
 	}
 
-	clFinish(ctx->CommandQueues);
-
 	size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { 8, 8 };
 	if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
 	{
@@ -1181,64 +1185,23 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
 		return ERR_OCL_API;
 	}
 
-	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[2], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces, 0, NULL, NULL)) != CL_SUCCESS)
-	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
-		return ERR_OCL_API;
-	}
-
-	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[3], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces + 1, 0, NULL, NULL)) != CL_SUCCESS)
-	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
-		return ERR_OCL_API;
-	}
-
-	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[4], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces + 2, 0, NULL, NULL)) != CL_SUCCESS)
-	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
-		return ERR_OCL_API;
-	}
-
-	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[5], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces + 3, 0, NULL, NULL)) != CL_SUCCESS)
-	{
-		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
-		return ERR_OCL_API;
-	}
-
-	clFinish(ctx->CommandQueues);
-
 	for(int i = 0; i < 4; ++i)
 	{
-		if(BranchNonces[i])
+		size_t tmpNonce = ctx->Nonce;
+		if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
 		{
-			// Threads
-            cl_uint numThreads = BranchNonces[i];
-			if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
-			{
-				printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4);
-				return(ERR_OCL_API);
-			}
-
-			// round up to next multiple of w_size
-			BranchNonces[i] = ((BranchNonces[i] + w_size - 1u) / w_size) * w_size;
-			// number of global threads must be a multiple of the work group size (w_size)
-			assert(BranchNonces[i]%w_size == 0);
-			size_t tmpNonce = ctx->Nonce;
-			if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][i + 3], 1, &tmpNonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
-			{
-				printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3);
-				return ERR_OCL_API;
-			}
+			printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3);
+			return ERR_OCL_API;
 		}
 	}
 
+	// this call is blocking therefore the access to the results without cl_finish is fine
 	if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_TRUE, 0, sizeof(cl_uint) * 0x100, HashOutput, 0, NULL, NULL)) != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
 		return ERR_OCL_API;
 	}
 
-	clFinish(ctx->CommandQueues);
 	auto & numHashValues = HashOutput[0xFF];
 	// avoid out of memory read, we have only storage for 0xFF results
 	if(numHashValues > 0xFF)
diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
index 81c0d5ff9..dd9f1d4e2 100644
--- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
+++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -401,7 +401,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
         AES2[i] = rotate(tmp, 16U);
         AES3[i] = rotate(tmp, 24U);
     }
-        
+
     __local ulong State_buf[8 * 25];
 
     barrier(CLK_LOCAL_MEM_FENCE);
@@ -474,12 +474,12 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
     }
 
     mem_fence(CLK_LOCAL_MEM_FENCE);
-        
+
 // cryptonight_heavy || cryptonight_haven || cryptonight_bittube2
 #if (ALGO == 4 || ALGO == 9 || ALGO == 10)
     __local uint4 xin[8][8];
     {
-        
+
 
         /* Also left over threads perform this loop.
          * The left over thread results will be ignored
@@ -530,7 +530,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 
 )==="
 R"===(
-        
+
 // cryptonight_monero_v8 && NVIDIA
 #if(ALGO==11 && defined(__NV_CL_C_VERSION))
 #	define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4))))
@@ -630,7 +630,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 		tweak1_2 ^= as_uint2(states[24]);
 #endif
     }
-    
+
     mem_fence(CLK_LOCAL_MEM_FENCE);
 
 #if(COMP_MODE==1)
@@ -755,7 +755,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #endif
 
         ((uint4 *)a)[0] ^= tmp;
-    
+
 // cryptonight_monero_v8
 #if (ALGO == 11)
 #	if defined(__NV_CL_C_VERSION)
@@ -815,7 +815,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
     __local uint4 xin1[8][8];
     __local uint4 xin2[8][8];
 #endif
-        
+
 #if(COMP_MODE==1)
     // do not use early return here
     if(gIdx < Threads)
@@ -971,7 +971,7 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
     const ulong idx = get_global_id(0) - get_global_offset(0);
 
     // do not use early return here
-    if(idx < Threads)
+    if(idx < BranchBuf[Threads])
     {
         states += 25 * BranchBuf[idx];
 
@@ -1019,8 +1019,8 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
             ulong outIdx = atomic_inc(output + 0xFF);
 			if(outIdx < 0xFF)
 				output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
-            }
         }
+    }
     mem_fence(CLK_GLOBAL_MEM_FENCE);
 }
 
@@ -1052,7 +1052,7 @@ __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint
     const uint idx = get_global_id(0) - get_global_offset(0);
 
     // do not use early return here
-    if(idx < Threads)
+    if(idx < BranchBuf[Threads])
     {
         states += 25 * BranchBuf[idx];
 
@@ -1106,7 +1106,7 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u
     const uint idx = get_global_id(0) - get_global_offset(0);
 
     // do not use early return here
-    if(idx < Threads)
+    if(idx < BranchBuf[Threads])
     {
         states += 25 * BranchBuf[idx];
 
@@ -1182,7 +1182,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
     const uint idx = get_global_id(0) - get_global_offset(0);
 
     // do not use early return here
-    if(idx < Threads)
+    if(idx < BranchBuf[Threads])
     {
         states += 25 * BranchBuf[idx];