Skip to content

Commit

Permalink
Merge pull request fireice-uk#2081 from psychocrypt/topic-reduceAPIOv…
Browse files Browse the repository at this point in the history
…erhead

OpenCL reduce API overhead
  • Loading branch information
fireice-uk authored Nov 21, 2018
2 parents 922f5f6 + c3c4f7b commit de33fa3
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 60 deletions.
59 changes: 11 additions & 48 deletions xmrstak/backend/amd/amd_gpu/gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1116,6 +1116,12 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3);
return ERR_OCL_API;
}

if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4);
return(ERR_OCL_API);
}
}

return ERR_SUCCESS;
Expand Down Expand Up @@ -1158,8 +1164,6 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
return ERR_OCL_API;
}

clFinish(ctx->CommandQueues);

size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { 8, 8 };
if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS)
{
Expand All @@ -1181,64 +1185,23 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo)
return ERR_OCL_API;
}

if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[2], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces, 0, NULL, NULL)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
return ERR_OCL_API;
}

if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[3], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces + 1, 0, NULL, NULL)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
return ERR_OCL_API;
}

if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[4], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces + 2, 0, NULL, NULL)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
return ERR_OCL_API;
}

if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[5], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces + 3, 0, NULL, NULL)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
return ERR_OCL_API;
}

clFinish(ctx->CommandQueues);

for(int i = 0; i < 4; ++i)
{
if(BranchNonces[i])
size_t tmpNonce = ctx->Nonce;
if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
{
// Threads
cl_uint numThreads = BranchNonces[i];
if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4);
return(ERR_OCL_API);
}

// round up to next multiple of w_size
BranchNonces[i] = ((BranchNonces[i] + w_size - 1u) / w_size) * w_size;
// number of global threads must be a multiple of the work group size (w_size)
assert(BranchNonces[i]%w_size == 0);
size_t tmpNonce = ctx->Nonce;
if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][i + 3], 1, &tmpNonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3);
return ERR_OCL_API;
}
printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3);
return ERR_OCL_API;
}
}

// this call is blocking therefore the access to the results without cl_finish is fine
if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_TRUE, 0, sizeof(cl_uint) * 0x100, HashOutput, 0, NULL, NULL)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret));
return ERR_OCL_API;
}

clFinish(ctx->CommandQueues);
auto & numHashValues = HashOutput[0xFF];
// avoid out of memory read, we have only storage for 0xFF results
if(numHashValues > 0xFF)
Expand Down
24 changes: 12 additions & 12 deletions xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
AES2[i] = rotate(tmp, 16U);
AES3[i] = rotate(tmp, 24U);
}

__local ulong State_buf[8 * 25];

barrier(CLK_LOCAL_MEM_FENCE);
Expand Down Expand Up @@ -474,12 +474,12 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
}

mem_fence(CLK_LOCAL_MEM_FENCE);

// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2
#if (ALGO == 4 || ALGO == 9 || ALGO == 10)
__local uint4 xin[8][8];
{


/* Also left over threads perform this loop.
* The left over thread results will be ignored
Expand Down Expand Up @@ -530,7 +530,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,

)==="
R"===(

// cryptonight_monero_v8 && NVIDIA
#if(ALGO==11 && defined(__NV_CL_C_VERSION))
# define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4))))
Expand Down Expand Up @@ -628,7 +628,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
tweak1_2 ^= as_uint2(states[24]);
#endif
}

mem_fence(CLK_LOCAL_MEM_FENCE);

#if(COMP_MODE==1)
Expand Down Expand Up @@ -753,7 +753,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
#endif

((uint4 *)a)[0] ^= tmp;

// cryptonight_monero_v8
#if (ALGO == 11)
# if defined(__NV_CL_C_VERSION)
Expand Down Expand Up @@ -813,7 +813,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states
__local uint4 xin1[8][8];
__local uint4 xin2[8][8];
#endif

#if(COMP_MODE==1)
// do not use early return here
if(gIdx < Threads)
Expand Down Expand Up @@ -969,7 +969,7 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
const ulong idx = get_global_id(0) - get_global_offset(0);

// do not use early return here
if(idx < Threads)
if(idx < BranchBuf[Threads])
{
states += 25 * BranchBuf[idx];

Expand Down Expand Up @@ -1017,8 +1017,8 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u
ulong outIdx = atomic_inc(output + 0xFF);
if(outIdx < 0xFF)
output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0);
}
}
}
mem_fence(CLK_GLOBAL_MEM_FENCE);
}

Expand Down Expand Up @@ -1050,7 +1050,7 @@ __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint
const uint idx = get_global_id(0) - get_global_offset(0);

// do not use early return here
if(idx < Threads)
if(idx < BranchBuf[Threads])
{
states += 25 * BranchBuf[idx];

Expand Down Expand Up @@ -1104,7 +1104,7 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u
const uint idx = get_global_id(0) - get_global_offset(0);

// do not use early return here
if(idx < Threads)
if(idx < BranchBuf[Threads])
{
states += 25 * BranchBuf[idx];

Expand Down Expand Up @@ -1180,7 +1180,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global
const uint idx = get_global_id(0) - get_global_offset(0);

// do not use early return here
if(idx < Threads)
if(idx < BranchBuf[Threads])
{
states += 25 * BranchBuf[idx];

Expand Down

0 comments on commit de33fa3

Please sign in to comment.