Skip to content

Commit

Permalink
Merge pull request fireice-uk#2285 from fireice-uk/dev
Browse files Browse the repository at this point in the history
release 2.10.0
  • Loading branch information
fireice-uk authored Mar 7, 2019
2 parents c77a62d + edf4b01 commit 7591274
Showing 40 changed files with 6,937 additions and 244 deletions.
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -508,18 +508,23 @@ if(CMAKE_C_COMPILER_ID MATCHES "MSVC")
# asm optimized monero v8 code
enable_language(ASM_MASM)
set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm" PROPERTY ASM_MASM)
set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cnR/CryptonightR_template.asm" PROPERTY ASM_MASM)
add_library(xmr-stak-asm
STATIC
"xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm"
"xmrstak/backend/cpu/crypto/asm/cnR/CryptonightR_template.asm"
)
else()
# asm optimized monero v8 code
enable_language(ASM)
set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" PROPERTY CPP)
set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cnR/CryptonightR_template.S" PROPERTY CPP)
set_source_files_properties("xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp")
set_source_files_properties("xmrstak/backend/cpu/crypto/asm/cnR/CryptonightR_template.S" PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp")
add_library(xmr-stak-asm
STATIC
"xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S"
"xmrstak/backend/cpu/crypto/asm/cnR/CryptonightR_template.S"
)
endif()

1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -71,6 +71,7 @@ If your prefered coin is not listed, you can choose one of the following algorit
- cryptonight_v7_stellite
- cryptonight_v8
- cryptonight_v8_half (used by masari and stellite)
- cryptonight_v8_reversewaltz (used by graft)
- cryptonight_v8_zelerius
- 4MiB scratchpad memory
- cryptonight_haven
2 changes: 1 addition & 1 deletion xmrstak/backend/amd/OclCryptonightR_gen.cpp
Original file line number Diff line number Diff line change
@@ -282,7 +282,7 @@ cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t
code_size = v4_random_math_init<cryptonight_r>(code, height);
break;
default:
printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: invalid algo %d", algo);
printer::inst()->print_msg(L0, "CryptonightR_get_program: invalid algo %d", algo);
return nullptr;
}

36 changes: 14 additions & 22 deletions xmrstak/backend/amd/amd_gpu/gpu.cpp
Original file line number Diff line number Diff line change
@@ -294,7 +294,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
* this is required if the dev pool is mining monero
* but the user tuned there settings for another currency
*/
if(miner_algo == cryptonight_monero_v8)
if(miner_algo == cryptonight_monero_v8 || miner_algo == cryptonight_v8_reversewaltz)
{
if(ctx->memChunk < 2)
mem_chunk_exp = 1u << 2;
@@ -774,7 +774,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
// Same as the platform index sanity check, except we must check all requested device indexes
for(int i = 0; i < num_gpus; ++i)
{
if(entries <= ctx[i].deviceIdx)
if(ctx[i].deviceIdx >= entries)
{
printer::inst()->print_msg(L1,"Selected OpenCL device index %lu doesn't exist.\n", ctx[i].deviceIdx);
return ERR_STUPID_PARAMS;
@@ -793,17 +793,22 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
}

// Indexes sanity checked above
#ifdef __GNUC__
cl_device_id TempDeviceList[num_gpus];
#else
cl_device_id* TempDeviceList = (cl_device_id*)_alloca(entries * sizeof(cl_device_id));
#endif
std::vector<cl_device_id> TempDeviceList(num_gpus, nullptr);

printer::inst()->print_msg(LDEBUG, "Number of OpenCL GPUs %d", entries);
for(int i = 0; i < num_gpus; ++i)
{
ctx[i].DeviceID = DeviceIDList[ctx[i].deviceIdx];
TempDeviceList[i] = DeviceIDList[ctx[i].deviceIdx];
}

cl_context opencl_ctx = clCreateContext(NULL, num_gpus, TempDeviceList.data(), NULL, NULL, &ret);
if(ret != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clCreateContext.", err_to_str(ret));
return ERR_OCL_API;
}

const char *fastIntMathV2CL =
#include "./opencl/fast_int_math_v2.cl"
;
@@ -847,22 +852,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)

std::vector<std::shared_ptr<InterleaveData>> interleaveData(num_gpus, nullptr);

std::vector<cl_context> context_vec(entries, nullptr);
for(int i = 0; i < num_gpus; ++i)
{
if(context_vec[ctx[i].deviceIdx] == nullptr)
{
context_vec[ctx[i].deviceIdx] = clCreateContext(NULL, 1, &(ctx[i].DeviceID), NULL, NULL, &ret);
if(ret != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clCreateContext.", err_to_str(ret));
return ERR_OCL_API;
}
}
}

for(int i = 0; i < num_gpus; ++i)
{
printer::inst()->print_msg(LDEBUG,"OpenCL Init device %d", ctx[i].deviceIdx);
const size_t devIdx = ctx[i].deviceIdx;
if(interleaveData.size() <= devIdx)
{
@@ -879,7 +871,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
ctx[i].interleaveData = interleaveData[devIdx];
ctx[i].interleaveData->adjustThreshold = static_cast<double>(ctx[i].interleave)/100.0;
ctx[i].interleaveData->startAdjustThreshold = ctx[i].interleaveData->adjustThreshold;
ctx[i].opencl_ctx = context_vec[ctx[i].deviceIdx];
ctx[i].opencl_ctx = opencl_ctx;

if((ret = InitOpenCLGpu(ctx->opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS)
{
33 changes: 24 additions & 9 deletions xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
Original file line number Diff line number Diff line change
@@ -30,6 +30,7 @@ R"===(
#define cryptonight_superfast 12
#define cryptonight_gpu 13
#define cryptonight_conceal 14
#define cryptonight_v8_reversewaltz 17

/* For Mesa clover support */
#ifdef cl_clang_storage_class_specifiers
@@ -639,7 +640,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
R"===(

// __NV_CL_C_VERSION checks if NVIDIA opencl is used
#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION))
#if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && defined(__NV_CL_C_VERSION))
# define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4))))
# define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4)))))
#else
@@ -659,7 +660,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
float4 conc_var = (float4)(0.0f);
#endif

#if(ALGO == cryptonight_monero_v8)
#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
ulong b[4];
uint4 b_x[2];
// NVIDIA
@@ -673,7 +674,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
#endif
__local uint AES0[256], AES1[256];

#if(ALGO == cryptonight_monero_v8)
#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
# if defined(__clang__) && !defined(__NV_CL_C_VERSION)
__local uint RCP[256];
# endif
@@ -689,7 +690,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
AES0[i] = tmp;
AES1[i] = rotate(tmp, 8U);

#if(ALGO == cryptonight_monero_v8 && (defined(__clang__) && !defined(__NV_CL_C_VERSION)))
#if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && (defined(__clang__) && !defined(__NV_CL_C_VERSION)))
RCP[i] = RCP_C[i];
#endif
}
@@ -723,7 +724,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states

b_x[0] = ((uint4 *)b)[0];

#if(ALGO == cryptonight_monero_v8)
#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
a[1] = states[1] ^ states[5];
b[2] = states[8] ^ states[10];
b[3] = states[9] ^ states[11];
@@ -755,7 +756,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
{
ulong c[2];

#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION))
#if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && defined(__NV_CL_C_VERSION))
uint idxS = idx0 & 0x30U;
*scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL;
#endif
@@ -792,6 +793,15 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
}
#elif(ALGO == cryptonight_v8_reversewaltz)
{
ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(1));
ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(3));
SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
}
#endif

#if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
@@ -807,7 +817,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
SCRATCHPAD_CHUNK(0) = b_x[0];
idx0 = as_uint2(c[0]).s0 & MASK;

#elif(ALGO == cryptonight_monero_v8)
#elif(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0];
# ifdef __NV_CL_C_VERSION
// flush shuffled data
@@ -826,7 +836,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
uint4 tmp;
tmp = SCRATCHPAD_CHUNK(0);

#if(ALGO == cryptonight_monero_v8)
#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
// Use division and square root results from the _previous_ iteration to hide the latency
tmp.s0 ^= division_result.s0;
tmp.s1 ^= division_result.s1 ^ sqrt_result;
@@ -853,8 +863,13 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
result_mul ^= chunk2;
ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
#if(ALGO == cryptonight_v8_reversewaltz)
SCRATCHPAD_CHUNK(1) = as_uint4(chunk1 + ((ulong2 *)(b_x + 1))[0]);
SCRATCHPAD_CHUNK(2) = as_uint4(chunk3 + ((ulong2 *)b_x)[0]);
#else
SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
#endif
SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
a[0] += result_mul.s0;
a[1] += result_mul.s1;
@@ -882,7 +897,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states

((uint4 *)a)[0] ^= tmp;

#if (ALGO == cryptonight_monero_v8)
#if (ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
# if defined(__NV_CL_C_VERSION)
// flush shuffled data
SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line;
2 changes: 1 addition & 1 deletion xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
Original file line number Diff line number Diff line change
@@ -3,7 +3,7 @@ R"===(
* @author SChernykh
*/

#if(ALGO == cryptonight_monero_v8)
#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)

static const __constant uint RCP_C[256] =
{
10 changes: 5 additions & 5 deletions xmrstak/backend/amd/config.tpl
Original file line number Diff line number Diff line change
@@ -6,26 +6,26 @@ R"===(// generated by XMRSTAK_VERSION
* intensity - Number of parallel GPU threads (nothing to do with CPU threads)
* worksize - Number of local GPU threads (nothing to do with CPU threads)
* affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner.
* strided_index - switch memory pattern used for the scratch pad memory
* strided_index - switch memory pattern used for the scratchpad memory
* 3 = chunked memory, chunk size based on the 'worksize'
* required: intensity must be a multiple of worksize
* 2 = chunked memory, chunk size is controlled by 'mem_chunk'
* required: intensity must be a multiple of worksize
* 1 or true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks
* 1 or true = use 16 byte contiguous memory per thread, the next memory block has offset of intensity blocks
* (for cryptonight_v8 and monero it is equal to strided_index = 0)
* 0 or false = use a contiguous block of memory per thread
* mem_chunk - range 0 to 18: set the number of elements (16byte) per chunk
* this value is only used if 'strided_index' == 2
* element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256byte)
* element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256 byte)
* unroll - allow to control how often the POW main loop is unrolled; valid range [1;128) - for most OpenCL implementations it must be a power of two.
* comp_mode - Compatibility enable/disable the automatic guard around compute kernel which allows
* to use a intensity which is not the multiple of the worksize.
* to use an intensity which is not the multiple of the worksize.
* If you set false and the intensity is not multiple of the worksize the miner can crash:
* in this case set the intensity to a multiple of the worksize or activate comp_mode.
* interleave - Controls the starting point in time between two threads on the same GPU device relative to the last started thread.
* This option has only an effect if two compute threads using the same GPU device: valid range [0;100]
* 0 = disable thread interleaving
* 40 = each working thread waits until 40% of the hash calculation of the previous started thread is finished
* 40 = each working thread waits until 40% of the hash calculation of the previously started thread is finished
* "gpu_threads_conf" :
* [
* { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
9 changes: 4 additions & 5 deletions xmrstak/backend/amd/minethd.cpp
Original file line number Diff line number Diff line change
@@ -186,8 +186,7 @@ void minethd::work_main()

cpu::minethd::cn_on_new_job set_job;

cn_hash_fun hash_fun;
cpu::minethd::func_multi_selector<1>(hash_fun, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
cpu::minethd::func_multi_selector<1>(&cpu_ctx, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);

uint8_t version = 0;
size_t lastPoolId = 0;
@@ -228,12 +227,12 @@ void minethd::work_main()
if(new_version >= coinDesc.GetMiningForkVersion())
{
miner_algo = coinDesc.GetMiningAlgo();
cpu::minethd::func_multi_selector<1>(hash_fun, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
cpu::minethd::func_multi_selector<1>(&cpu_ctx, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
}
else
{
miner_algo = coinDesc.GetMiningAlgoRoot();
cpu::minethd::func_multi_selector<1>(hash_fun, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
cpu::minethd::func_multi_selector<1>(&cpu_ctx, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
}
lastPoolId = oWork.iPoolId;
version = new_version;
@@ -282,7 +281,7 @@ void minethd::work_main()

*(uint32_t*)(bWorkBlob + 39) = results[i];

hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx, miner_algo);
cpu_ctx->hash_fn(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx, miner_algo);
if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget)
executor::inst()->push_event(ex_event(job_result(oWork.sJobID, results[i], bResult, iThreadNo, miner_algo), oWork.iPoolId));
else
Loading

0 comments on commit 7591274

Please sign in to comment.