Skip to content

Commit

Permalink
Merge pull request fireice-uk#2296 from psychocrypt/fix-masari
Browse files Browse the repository at this point in the history
fix masari
  • Loading branch information
fireice-uk authored Mar 10, 2019
2 parents edf4b01 + c89c375 commit c6f418e
Show file tree
Hide file tree
Showing 10 changed files with 29 additions and 72 deletions.
8 changes: 4 additions & 4 deletions xmrstak/backend/amd/amd_gpu/gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
return ERR_OCL_API;
}

ctx->InputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, 88, NULL, &ret);
ctx->InputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, 128, NULL, &ret);
if(ret != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clCreateBuffer to create input buffer.", err_to_str(ret));
Expand Down Expand Up @@ -889,15 +889,15 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar

cl_int ret;

if(input_len > 84)
if(input_len > 124)
return ERR_STUPID_PARAMS;

input[input_len] = 0x01;
memset(input + input_len + 1, 0, 88 - input_len - 1);
memset(input + input_len + 1, 0, 128 - input_len - 1);

cl_uint numThreads = ctx->rawIntensity;

if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->InputBuffer, CL_TRUE, 0, 88, input, 0, NULL, NULL)) != CL_SUCCESS)
if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->InputBuffer, CL_TRUE, 0, 128, input, 0, NULL, NULL)) != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to fill input buffer.", err_to_str(ret));
return ERR_OCL_API;
Expand Down
13 changes: 9 additions & 4 deletions xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,11 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
State[8] = input[8];
State[9] = input[9];
State[10] = input[10];
State[11] = input[11];
State[12] = input[12];
State[13] = input[13];
State[14] = input[14];
State[15] = input[15];

((__local uint *)State)[9] &= 0x00FFFFFFU;
((__local uint *)State)[9] |= (((uint)get_global_id(0)) & 0xFF) << 24;
Expand All @@ -550,13 +555,13 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
*/
((__local uint *)State)[10] |= (((uint)get_global_id(0) >> 8));

for (int i = 11; i < 25; ++i) {
State[i] = 0x00UL;
}

// Last bit of padding
State[16] = 0x8000000000000000UL;

for (int i = 17; i < 25; ++i) {
State[i] = 0x00UL;
}

keccakf1600_2(State);

#pragma unroll
Expand Down
4 changes: 2 additions & 2 deletions xmrstak/backend/amd/autoAdjust.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,8 +187,8 @@ class autoAdjust
memPerThread = std::min(memPerThread, memDoubleThread);
}

// 224byte extra memory is used per thread for meta data
size_t perThread = hashMemSize + 224u;
// 240byte extra memory is used per thread for meta data
size_t perThread = hashMemSize + 240u;
size_t maxIntensity = memPerThread / perThread;
size_t possibleIntensity = std::min( maxThreads , maxIntensity );
// map intensity to a multiple of the compute unit count, 8 is the number of threads per work group
Expand Down
2 changes: 1 addition & 1 deletion xmrstak/backend/amd/minethd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ void minethd::work_main()

for(size_t i = 0; i < results[0xFF]; i++)
{
uint8_t bWorkBlob[112];
uint8_t bWorkBlob[128];
uint8_t bResult[32];

memcpy(bWorkBlob, oWork.bWorkBlob, oWork.iWorkSize);
Expand Down
6 changes: 3 additions & 3 deletions xmrstak/backend/miner_work.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ namespace xmrstak
struct miner_work
{
char sJobID[64];
uint8_t bWorkBlob[112];
uint8_t bWorkBlob[128];
uint32_t iWorkSize;
uint64_t iTarget;
bool bNiceHash;
Expand All @@ -28,15 +28,15 @@ namespace xmrstak

miner_work(const char* sJobID, const uint8_t* bWork, uint32_t iWorkSize,
uint64_t iTarget, bool bNiceHash, size_t iPoolId, uint64_t iBlockHeiht) : iWorkSize(iWorkSize),
iTarget(iTarget), bNiceHash(bNiceHash), bStall(false), iPoolId(iPoolId), iBlockHeight(iBlockHeiht), ref_ptr((uint8_t*)&iBlockHeight)
iTarget(iTarget), bNiceHash(bNiceHash), bStall(false), iPoolId(iPoolId), iBlockHeight(iBlockHeiht), ref_ptr((uint8_t*)&iBlockHeight)
{
assert(iWorkSize <= sizeof(bWorkBlob));
memcpy(this->bWorkBlob, bWork, iWorkSize);
memcpy(this->sJobID, sJobID, sizeof(miner_work::sJobID));
}

miner_work(miner_work&& from) : iWorkSize(from.iWorkSize), iTarget(from.iTarget),
bStall(from.bStall), iPoolId(from.iPoolId), iBlockHeight(from.iBlockHeight), ref_ptr((uint8_t*)&iBlockHeight)
bStall(from.bStall), iPoolId(from.iPoolId), iBlockHeight(from.iBlockHeight), ref_ptr((uint8_t*)&iBlockHeight)
{
assert(iWorkSize <= sizeof(bWorkBlob));
memcpy(bWorkBlob, from.bWorkBlob, iWorkSize);
Expand Down
2 changes: 1 addition & 1 deletion xmrstak/backend/nvidia/minethd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ void minethd::work_main()
for(size_t i = 0; i < foundCount; i++)
{

uint8_t bWorkBlob[112];
uint8_t bWorkBlob[128];
uint8_t bResult[32];

memcpy(bWorkBlob, oWork.bWorkBlob, oWork.iWorkSize);
Expand Down
4 changes: 2 additions & 2 deletions xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
uint32_t ctx_b[4];
uint32_t ctx_key1[40];
uint32_t ctx_key2[40];
uint32_t input[21];
uint32_t input[32];

memcpy( input, d_input, len );
//*((uint32_t *)(((char *)input) + 39)) = startNonce + thread;
Expand Down Expand Up @@ -349,7 +349,7 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_a, 4 * sizeof(uint32_t) * wsize));
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_ctx_b, ctx_b_size));
// POW block format http://monero.wikia.com/wiki/PoW_Block_Header_Format
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, 21 * sizeof (uint32_t ) ));
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_input, 32 * sizeof (uint32_t ) ));
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_count, sizeof (uint32_t ) ));
CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_result_nonce, 10 * sizeof (uint32_t ) ));
CUDA_CHECK_MSG(
Expand Down
49 changes: 1 addition & 48 deletions xmrstak/backend/nvidia/nvcc_code/cuda_keccak.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,54 +103,7 @@ __device__ __forceinline__ void cn_keccakf(uint64_t *s)
{
uint64_t bc[5], tmpxor[5], tmp1, tmp2;

tmpxor[0] = s[0] ^ s[5];
tmpxor[1] = s[1] ^ s[6] ^ 0x8000000000000000ULL;
tmpxor[2] = s[2] ^ s[7];
tmpxor[3] = s[3] ^ s[8];
tmpxor[4] = s[4] ^ s[9];

bc[0] = tmpxor[0] ^ rotl64_1(tmpxor[2], 1);
bc[1] = tmpxor[1] ^ rotl64_1(tmpxor[3], 1);
bc[2] = tmpxor[2] ^ rotl64_1(tmpxor[4], 1);
bc[3] = tmpxor[3] ^ rotl64_1(tmpxor[0], 1);
bc[4] = tmpxor[4] ^ rotl64_1(tmpxor[1], 1);

tmp1 = s[1] ^ bc[0];

s[0] ^= bc[4];
s[1] = rotl64_2(s[6] ^ bc[0], 12);
s[6] = rotl64_1(s[9] ^ bc[3], 20);
s[9] = rotl64_2(bc[1], 29);
s[22] = rotl64_2(bc[3], 7);
s[14] = rotl64_1(bc[4], 18);
s[20] = rotl64_2(s[2] ^ bc[1], 30);
s[2] = rotl64_2(bc[1], 11);
s[12] = rotl64_1(bc[2], 25);
s[13] = rotl64_1(bc[3], 8);
s[19] = rotl64_2(bc[2], 24);
s[23] = rotl64_2(bc[4], 9);
s[15] = rotl64_1(s[4] ^ bc[3], 27);
s[4] = rotl64_1(bc[3], 14);
s[24] = rotl64_1(bc[0], 2);
s[21] = rotl64_2(s[8] ^ bc[2], 23);
s[8] = rotl64_2(0x8000000000000000ULL ^ bc[0], 13);
s[16] = rotl64_2(s[5] ^ bc[4], 4);
s[5] = rotl64_1(s[3] ^ bc[2], 28);
s[3] = rotl64_1(bc[2], 21);
s[18] = rotl64_1(bc[1], 15);
s[17] = rotl64_1(bc[0], 10);
s[11] = rotl64_1(s[7] ^ bc[1], 6);
s[7] = rotl64_1(bc[4], 3);
s[10] = rotl64_1(tmp1, 1);

tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1);
tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1);
s[0] ^= 0x0000000000000001;

for(int i = 1; i < 24; ++i)
for(int i = 0; i < 24; ++i)
{
tmpxor[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
tmpxor[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
Expand Down
11 changes: 5 additions & 6 deletions xmrstak/cli/cli-miner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -850,8 +850,8 @@ int do_benchmark(int block_version, int wait_sec, int work_sec)

printer::inst()->print_msg(L0, "Prepare benchmark for block version %d", block_version);

uint8_t work[112];
memset(work,0,112);
uint8_t work[128];
memset(work,0,128);
work[0] = static_cast<uint8_t>(block_version);

xmrstak::pool_data dat;
Expand All @@ -862,15 +862,14 @@ int do_benchmark(int block_version, int wait_sec, int work_sec)
printer::inst()->print_msg(L0, "Wait %d sec until all backends are initialized",wait_sec);
std::this_thread::sleep_for(std::chrono::seconds(wait_sec));

/* AMD and NVIDIA is currently only supporting work sizes up to 84byte
* \todo fix this issue
/* AMD and NVIDIA is currently only supporting work sizes up to 128byte
*/
printer::inst()->print_msg(L0, "Start a %d second benchmark...",work_sec);
xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 84, 0, false, 0, 0), dat);
xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 128, 0, false, 0, 0), dat);
uint64_t iStartStamp = get_timestamp_ms();

std::this_thread::sleep_for(std::chrono::seconds(work_sec));
xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 84, 0, false, 0, 0), dat);
xmrstak::globalStates::inst().switch_work(xmrstak::miner_work("", work, 128, 0, false, 0, 0), dat);

double fTotalHps = 0.0;
for (uint32_t i = 0; i < pvThreads->size(); i++)
Expand Down
2 changes: 1 addition & 1 deletion xmrstak/net/msgstruct.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
struct pool_job
{
char sJobID[64];
uint8_t bWorkBlob[112];
uint8_t bWorkBlob[128];
uint64_t iTarget;
uint32_t iWorkLen;
uint32_t iSavedNonce;
Expand Down

0 comments on commit c6f418e

Please sign in to comment.