From be2144d639caa15511ad05eca1018f3e57e79c70 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Sat, 9 Mar 2019 22:42:44 +0100 Subject: [PATCH] fix masari Since masari increased the block size the miner crashed each time it gets connected with a masari pool. This PR extent the possible size of a block to 128 byte and updated the kernel. --- xmrstak/backend/amd/amd_gpu/gpu.cpp | 8 +-- .../backend/amd/amd_gpu/opencl/cryptonight.cl | 13 +++-- xmrstak/backend/amd/autoAdjust.hpp | 4 +- xmrstak/backend/amd/minethd.cpp | 2 +- xmrstak/backend/miner_work.hpp | 6 +-- xmrstak/backend/nvidia/minethd.cpp | 2 +- .../backend/nvidia/nvcc_code/cuda_extra.cu | 4 +- .../backend/nvidia/nvcc_code/cuda_keccak.hpp | 49 +------------------ xmrstak/cli/cli-miner.cpp | 11 ++--- xmrstak/net/msgstruct.hpp | 2 +- 10 files changed, 29 insertions(+), 72 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index ace1c34bb..8de8d7b3a 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -199,7 +199,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ return ERR_OCL_API; } - ctx->InputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, 88, NULL, &ret); + ctx->InputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, 128, NULL, &ret); if(ret != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create input buffer.", err_to_str(ret)); @@ -889,15 +889,15 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar cl_int ret; - if(input_len > 84) + if(input_len > 124) return ERR_STUPID_PARAMS; input[input_len] = 0x01; - memset(input + input_len + 1, 0, 88 - input_len - 1); + memset(input + input_len + 1, 0, 128 - input_len - 1); cl_uint numThreads = ctx->rawIntensity; - if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->InputBuffer, CL_TRUE, 0, 88, input, 0, NULL, NULL)) != CL_SUCCESS) + if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->InputBuffer, CL_TRUE, 0, 128, input, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to fill input buffer.", err_to_str(ret)); return ERR_OCL_API; diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index b78f2bcf7..4b1016acf 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -539,6 +539,11 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, State[8] = input[8]; State[9] = input[9]; State[10] = input[10]; + State[11] = input[11]; + State[12] = input[12]; + State[13] = input[13]; + State[14] = input[14]; + State[15] = input[15]; ((__local uint *)State)[9] &= 0x00FFFFFFU; ((__local uint *)State)[9] |= (((uint)get_global_id(0)) & 0xFF) << 24; @@ -550,13 +555,13 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, */ ((__local uint *)State)[10] |= (((uint)get_global_id(0) >> 8)); - for (int i = 11; i < 25; ++i) { - State[i] = 0x00UL; - } - // Last bit of padding State[16] = 0x8000000000000000UL; + for (int i = 17; i < 25; ++i) { + State[i] = 0x00UL; + } + keccakf1600_2(State); #pragma unroll diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index ea688e053..120fb6898 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -187,8 +187,8 @@ class autoAdjust memPerThread = std::min(memPerThread, memDoubleThread); } - // 224byte extra memory is used per thread for meta data - size_t perThread = hashMemSize + 224u; + // 240byte extra memory is used per thread for meta data + size_t perThread = hashMemSize + 240u; size_t maxIntensity = memPerThread / perThread; size_t possibleIntensity = std::min( maxThreads , maxIntensity ); // map intensity to a multiple of the compute unit count, 8 is the number of threads per work group diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index 09e030e66..3be593175 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -273,7 +273,7 @@ void minethd::work_main() for(size_t i = 0; i < results[0xFF]; i++) { - uint8_t bWorkBlob[112]; + uint8_t bWorkBlob[128]; uint8_t bResult[32]; memcpy(bWorkBlob, oWork.bWorkBlob, oWork.iWorkSize); diff --git a/xmrstak/backend/miner_work.hpp b/xmrstak/backend/miner_work.hpp index c8174df32..d0e5237f2 100644 --- a/xmrstak/backend/miner_work.hpp +++ b/xmrstak/backend/miner_work.hpp @@ -15,7 +15,7 @@ namespace xmrstak struct miner_work { char sJobID[64]; - uint8_t bWorkBlob[112]; + uint8_t bWorkBlob[128]; uint32_t iWorkSize; uint64_t iTarget; bool bNiceHash; @@ -28,7 +28,7 @@ namespace xmrstak miner_work(const char* sJobID, const uint8_t* bWork, uint32_t iWorkSize, uint64_t iTarget, bool bNiceHash, size_t iPoolId, uint64_t iBlockHeiht) : iWorkSize(iWorkSize), - iTarget(iTarget), bNiceHash(bNiceHash), bStall(false), iPoolId(iPoolId), iBlockHeight(iBlockHeiht), ref_ptr((uint8_t*)&iBlockHeight) + iTarget(iTarget), bNiceHash(bNiceHash), bStall(false), iPoolId(iPoolId), iBlockHeight(iBlockHeiht), ref_ptr((uint8_t*)&iBlockHeight) { assert(iWorkSize <= sizeof(bWorkBlob)); memcpy(this->bWorkBlob, bWork, iWorkSize); @@ -36,7 +36,7 @@ namespace xmrstak } miner_work(miner_work&& from) : iWorkSize(from.iWorkSize), iTarget(from.iTarget), - bStall(from.bStall), iPoolId(from.iPoolId), iBlockHeight(from.iBlockHeight), ref_ptr((uint8_t*)&iBlockHeight) + bStall(from.bStall), iPoolId(from.iPoolId), iBlockHeight(from.iBlockHeight), ref_ptr((uint8_t*)&iBlockHeight) { assert(iWorkSize <= sizeof(bWorkBlob)); memcpy(bWorkBlob, from.bWorkBlob, iWorkSize); diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp index a50dd30cc..80615d7a3 100644 --- a/xmrstak/backend/nvidia/minethd.cpp +++ b/xmrstak/backend/nvidia/minethd.cpp @@ -285,7 +285,7 @@ void minethd::work_main() for(size_t i = 0; i < foundCount; i++) { - uint8_t bWorkBlob[112]; + uint8_t bWorkBlob[128]; uint8_t bResult[32]; memcpy(bWorkBlob, oWork.bWorkBlob, oWork.iWorkSize); diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index e909e2fa3..b6e41c619 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -112,7 +112,7 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric uint32_t ctx_b[4]; uint32_t ctx_key1[40]; uint32_t ctx_key2[40]; - uint32_t input[21]; + uint32_t input[32]; memcpy( input, d_input, len ); //*((uint32_t *)(((char *)input) + 39)) = startNonce + thread; @@ -349,7 +349,7 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_a, 4 * sizeof(uint32_t) * wsize)); CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_b, ctx_b_size)); // POW block format http://monero.wikia.com/wiki/PoW_Block_Header_Format - CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, 21 * sizeof (uint32_t ) )); + CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, 32 * sizeof (uint32_t ) )); CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_count, sizeof (uint32_t ) )); CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_nonce, 10 * sizeof (uint32_t ) )); CUDA_CHECK_MSG( diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp index c75c74964..3f535631d 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp @@ -103,54 +103,7 @@ __device__ __forceinline__ void cn_keccakf(uint64_t *s) { uint64_t bc[5], tmpxor[5], tmp1, tmp2; - tmpxor[0] = s[0] ^ s[5]; - tmpxor[1] = s[1] ^ s[6] ^ 0x8000000000000000ULL; - tmpxor[2] = s[2] ^ s[7]; - tmpxor[3] = s[3] ^ s[8]; - tmpxor[4] = s[4] ^ s[9]; - - bc[0] = tmpxor[0] ^ rotl64_1(tmpxor[2], 1); - bc[1] = tmpxor[1] ^ rotl64_1(tmpxor[3], 1); - bc[2] = tmpxor[2] ^ rotl64_1(tmpxor[4], 1); - bc[3] = tmpxor[3] ^ rotl64_1(tmpxor[0], 1); - bc[4] = tmpxor[4] ^ rotl64_1(tmpxor[1], 1); - - tmp1 = s[1] ^ bc[0]; - - s[0] ^= bc[4]; - s[1] = rotl64_2(s[6] ^ bc[0], 12); - s[6] = rotl64_1(s[9] ^ bc[3], 20); - s[9] = rotl64_2(bc[1], 29); - s[22] = rotl64_2(bc[3], 7); - s[14] = rotl64_1(bc[4], 18); - s[20] = rotl64_2(s[2] ^ bc[1], 30); - s[2] = rotl64_2(bc[1], 11); - s[12] = rotl64_1(bc[2], 25); - s[13] = rotl64_1(bc[3], 8); - s[19] = rotl64_2(bc[2], 24); - s[23] = rotl64_2(bc[4], 9); - s[15] = rotl64_1(s[4] ^ bc[3], 27); - s[4] = rotl64_1(bc[3], 14); - s[24] = rotl64_1(bc[0], 2); - s[21] = rotl64_2(s[8] ^ bc[2], 23); - s[8] = rotl64_2(0x8000000000000000ULL ^ bc[0], 13); - s[16] = rotl64_2(s[5] ^ bc[4], 4); - s[5] = rotl64_1(s[3] ^ bc[2], 28); - s[3] = rotl64_1(bc[2], 21); - s[18] = rotl64_1(bc[1], 15); - s[17] = rotl64_1(bc[0], 10); - s[11] = rotl64_1(s[7] ^ bc[1], 6); - s[7] = rotl64_1(bc[4], 3); - s[10] = rotl64_1(tmp1, 1); - - tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); - tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); - tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); - tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); - tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); - s[0] ^= 0x0000000000000001; - - for(int i = 1; i < 24; ++i) + for(int i = 0; i < 24; ++i) { tmpxor[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; tmpxor[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; diff --git a/xmrstak/cli/cli-miner.cpp b/xmrstak/cli/cli-miner.cpp index 418726208..d6822cd63 100644 --- a/xmrstak/cli/cli-miner.cpp +++ b/xmrstak/cli/cli-miner.cpp @@ -850,8 +850,8 @@ int do_benchmark(int block_version, int wait_sec, int work_sec) printer::inst()->print_msg(L0, "Prepare benchmark for block version %d", block_version); - uint8_t work[112]; - memset(work,0,112); + uint8_t work[128]; + memset(work,0,128); work[0] = static_cast(block_version); xmrstak::pool_data dat; @@ -862,15 +862,14 @@ int do_benchmark(int block_version, int wait_sec, int work_sec) printer::inst()->print_msg(L0, "Wait %d sec until all backends are initialized",wait_sec); std::this_thread::sleep_for(std::chrono::seconds(wait_sec)); - /* AMD and NVIDIA is currently only supporting work sizes up to 84byte - * \todo fix this issue + /* AMD and NVIDIA is currently only supporting work sizes up to 128byte */ printer::inst()->print_msg(L0, "Start a %d second benchmark...",work_sec); - xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 84, 0, false, 0, 0), dat); + xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 128, 0, false, 0, 0), dat); uint64_t iStartStamp = get_timestamp_ms(); std::this_thread::sleep_for(std::chrono::seconds(work_sec)); - xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 84, 0, false, 0, 0), dat); + xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 128, 0, false, 0, 0), dat); double fTotalHps = 0.0; for (uint32_t i = 0; i < pvThreads->size(); i++) diff --git a/xmrstak/net/msgstruct.hpp b/xmrstak/net/msgstruct.hpp index 813fc7d06..33980bf42 100644 --- a/xmrstak/net/msgstruct.hpp +++ b/xmrstak/net/msgstruct.hpp @@ -12,7 +12,7 @@ struct pool_job { char sJobID[64]; - uint8_t bWorkBlob[112]; + uint8_t bWorkBlob[128]; uint64_t iTarget; uint32_t iWorkLen; uint32_t iSavedNonce;