Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

release 2.10.0 #2285

Merged
merged 18 commits into from
Mar 7, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -508,18 +508,23 @@ if(CMAKE_C_COMPILER_ID MATCHES "MSVC")
# asm optimized monero v8 code
enable_language(ASM_MASM)
set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm" PROPERTY ASM_MASM)
set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cnR/CryptonightR_template.asm" PROPERTY ASM_MASM)
add_library(xmr-stak-asm
STATIC
"xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm"
"xmrstak/backend/cpu/crypto/asm/cnR/CryptonightR_template.asm"
)
else()
# asm optimized monero v8 code
enable_language(ASM)
set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" PROPERTY CPP)
set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cnR/CryptonightR_template.S" PROPERTY CPP)
set_source_files_properties("xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp")
set_source_files_properties("xmrstak/backend/cpu/crypto/asm/cnR/CryptonightR_template.S" PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp")
add_library(xmr-stak-asm
STATIC
"xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S"
"xmrstak/backend/cpu/crypto/asm/cnR/CryptonightR_template.S"
)
endif()

Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ If your prefered coin is not listed, you can choose one of the following algorit
- cryptonight_v7_stellite
- cryptonight_v8
- cryptonight_v8_half (used by masari and stellite)
- cryptonight_v8_reversewaltz (used by graft)
- cryptonight_v8_zelerius
- 4MiB scratchpad memory
- cryptonight_haven
Expand Down
2 changes: 1 addition & 1 deletion xmrstak/backend/amd/OclCryptonightR_gen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@ cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t
code_size = v4_random_math_init<cryptonight_r>(code, height);
break;
default:
printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: invalid algo %d", algo);
printer::inst()->print_msg(L0, "CryptonightR_get_program: invalid algo %d", algo);
return nullptr;
}

Expand Down
36 changes: 14 additions & 22 deletions xmrstak/backend/amd/amd_gpu/gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
* this is required if the dev pool is mining monero
* but the user tuned there settings for another currency
*/
if(miner_algo == cryptonight_monero_v8)
if(miner_algo == cryptonight_monero_v8 || miner_algo == cryptonight_v8_reversewaltz)
{
if(ctx->memChunk < 2)
mem_chunk_exp = 1u << 2;
Expand Down Expand Up @@ -774,7 +774,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
// Same as the platform index sanity check, except we must check all requested device indexes
for(int i = 0; i < num_gpus; ++i)
{
if(entries <= ctx[i].deviceIdx)
if(ctx[i].deviceIdx >= entries)
{
printer::inst()->print_msg(L1,"Selected OpenCL device index %lu doesn't exist.\n", ctx[i].deviceIdx);
return ERR_STUPID_PARAMS;
Expand All @@ -793,17 +793,22 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
}

// Indexes sanity checked above
#ifdef __GNUC__
cl_device_id TempDeviceList[num_gpus];
#else
cl_device_id* TempDeviceList = (cl_device_id*)_alloca(entries * sizeof(cl_device_id));
#endif
std::vector<cl_device_id> TempDeviceList(num_gpus, nullptr);

printer::inst()->print_msg(LDEBUG, "Number of OpenCL GPUs %d", entries);
for(int i = 0; i < num_gpus; ++i)
{
ctx[i].DeviceID = DeviceIDList[ctx[i].deviceIdx];
TempDeviceList[i] = DeviceIDList[ctx[i].deviceIdx];
}

cl_context opencl_ctx = clCreateContext(NULL, num_gpus, TempDeviceList.data(), NULL, NULL, &ret);
if(ret != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clCreateContext.", err_to_str(ret));
return ERR_OCL_API;
}

const char *fastIntMathV2CL =
#include "./opencl/fast_int_math_v2.cl"
;
Expand Down Expand Up @@ -847,22 +852,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)

std::vector<std::shared_ptr<InterleaveData>> interleaveData(num_gpus, nullptr);

std::vector<cl_context> context_vec(entries, nullptr);
for(int i = 0; i < num_gpus; ++i)
{
if(context_vec[ctx[i].deviceIdx] == nullptr)
{
context_vec[ctx[i].deviceIdx] = clCreateContext(NULL, 1, &(ctx[i].DeviceID), NULL, NULL, &ret);
if(ret != CL_SUCCESS)
{
printer::inst()->print_msg(L1,"Error %s when calling clCreateContext.", err_to_str(ret));
return ERR_OCL_API;
}
}
}

for(int i = 0; i < num_gpus; ++i)
{
printer::inst()->print_msg(LDEBUG,"OpenCL Init device %d", ctx[i].deviceIdx);
const size_t devIdx = ctx[i].deviceIdx;
if(interleaveData.size() <= devIdx)
{
Expand All @@ -879,7 +871,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
ctx[i].interleaveData = interleaveData[devIdx];
ctx[i].interleaveData->adjustThreshold = static_cast<double>(ctx[i].interleave)/100.0;
ctx[i].interleaveData->startAdjustThreshold = ctx[i].interleaveData->adjustThreshold;
ctx[i].opencl_ctx = context_vec[ctx[i].deviceIdx];
ctx[i].opencl_ctx = opencl_ctx;

if((ret = InitOpenCLGpu(ctx->opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS)
{
Expand Down
33 changes: 24 additions & 9 deletions xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ R"===(
#define cryptonight_superfast 12
#define cryptonight_gpu 13
#define cryptonight_conceal 14
#define cryptonight_v8_reversewaltz 17

/* For Mesa clover support */
#ifdef cl_clang_storage_class_specifiers
Expand Down Expand Up @@ -639,7 +640,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
R"===(

// __NV_CL_C_VERSION checks if NVIDIA opencl is used
#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION))
#if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && defined(__NV_CL_C_VERSION))
# define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4))))
# define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4)))))
#else
Expand All @@ -659,7 +660,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
float4 conc_var = (float4)(0.0f);
#endif

#if(ALGO == cryptonight_monero_v8)
#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
ulong b[4];
uint4 b_x[2];
// NVIDIA
Expand All @@ -673,7 +674,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
#endif
__local uint AES0[256], AES1[256];

#if(ALGO == cryptonight_monero_v8)
#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
# if defined(__clang__) && !defined(__NV_CL_C_VERSION)
__local uint RCP[256];
# endif
Expand All @@ -689,7 +690,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
AES0[i] = tmp;
AES1[i] = rotate(tmp, 8U);

#if(ALGO == cryptonight_monero_v8 && (defined(__clang__) && !defined(__NV_CL_C_VERSION)))
#if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && (defined(__clang__) && !defined(__NV_CL_C_VERSION)))
RCP[i] = RCP_C[i];
#endif
}
Expand Down Expand Up @@ -723,7 +724,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states

b_x[0] = ((uint4 *)b)[0];

#if(ALGO == cryptonight_monero_v8)
#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
a[1] = states[1] ^ states[5];
b[2] = states[8] ^ states[10];
b[3] = states[9] ^ states[11];
Expand Down Expand Up @@ -755,7 +756,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
{
ulong c[2];

#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION))
#if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && defined(__NV_CL_C_VERSION))
uint idxS = idx0 & 0x30U;
*scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL;
#endif
Expand Down Expand Up @@ -792,6 +793,15 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
}
#elif(ALGO == cryptonight_v8_reversewaltz)
{
ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(1));
ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(3));
SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
}
#endif

#if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
Expand All @@ -807,7 +817,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
SCRATCHPAD_CHUNK(0) = b_x[0];
idx0 = as_uint2(c[0]).s0 & MASK;

#elif(ALGO == cryptonight_monero_v8)
#elif(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0];
# ifdef __NV_CL_C_VERSION
// flush shuffled data
Expand All @@ -826,7 +836,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
uint4 tmp;
tmp = SCRATCHPAD_CHUNK(0);

#if(ALGO == cryptonight_monero_v8)
#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
// Use division and square root results from the _previous_ iteration to hide the latency
tmp.s0 ^= division_result.s0;
tmp.s1 ^= division_result.s1 ^ sqrt_result;
Expand All @@ -853,8 +863,13 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
result_mul ^= chunk2;
ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
#if(ALGO == cryptonight_v8_reversewaltz)
SCRATCHPAD_CHUNK(1) = as_uint4(chunk1 + ((ulong2 *)(b_x + 1))[0]);
SCRATCHPAD_CHUNK(2) = as_uint4(chunk3 + ((ulong2 *)b_x)[0]);
#else
SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
#endif
SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
a[0] += result_mul.s0;
a[1] += result_mul.s1;
Expand Down Expand Up @@ -882,7 +897,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states

((uint4 *)a)[0] ^= tmp;

#if (ALGO == cryptonight_monero_v8)
#if (ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
# if defined(__NV_CL_C_VERSION)
// flush shuffled data
SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line;
Expand Down
2 changes: 1 addition & 1 deletion xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ R"===(
* @author SChernykh
*/

#if(ALGO == cryptonight_monero_v8)
#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)

static const __constant uint RCP_C[256] =
{
Expand Down
10 changes: 5 additions & 5 deletions xmrstak/backend/amd/config.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -6,26 +6,26 @@ R"===(// generated by XMRSTAK_VERSION
* intensity - Number of parallel GPU threads (nothing to do with CPU threads)
* worksize - Number of local GPU threads (nothing to do with CPU threads)
* affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner.
* strided_index - switch memory pattern used for the scratch pad memory
* strided_index - switch memory pattern used for the scratchpad memory
* 3 = chunked memory, chunk size based on the 'worksize'
* required: intensity must be a multiple of worksize
* 2 = chunked memory, chunk size is controlled by 'mem_chunk'
* required: intensity must be a multiple of worksize
* 1 or true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks
* 1 or true = use 16 byte contiguous memory per thread, the next memory block has offset of intensity blocks
* (for cryptonight_v8 and monero it is equal to strided_index = 0)
* 0 or false = use a contiguous block of memory per thread
* mem_chunk - range 0 to 18: set the number of elements (16byte) per chunk
* this value is only used if 'strided_index' == 2
* element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256byte)
* element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256 byte)
* unroll - allow to control how often the POW main loop is unrolled; valid range [1;128) - for most OpenCL implementations it must be a power of two.
* comp_mode - Compatibility enable/disable the automatic guard around compute kernel which allows
* to use a intensity which is not the multiple of the worksize.
* to use an intensity which is not the multiple of the worksize.
* If you set false and the intensity is not multiple of the worksize the miner can crash:
* in this case set the intensity to a multiple of the worksize or activate comp_mode.
* interleave - Controls the starting point in time between two threads on the same GPU device relative to the last started thread.
* This option has only an effect if two compute threads using the same GPU device: valid range [0;100]
* 0 = disable thread interleaving
* 40 = each working thread waits until 40% of the hash calculation of the previous started thread is finished
* 40 = each working thread waits until 40% of the hash calculation of the previously started thread is finished
* "gpu_threads_conf" :
* [
* { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
Expand Down
9 changes: 4 additions & 5 deletions xmrstak/backend/amd/minethd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,8 +186,7 @@ void minethd::work_main()

cpu::minethd::cn_on_new_job set_job;

cn_hash_fun hash_fun;
cpu::minethd::func_multi_selector<1>(hash_fun, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
cpu::minethd::func_multi_selector<1>(&cpu_ctx, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);

uint8_t version = 0;
size_t lastPoolId = 0;
Expand Down Expand Up @@ -228,12 +227,12 @@ void minethd::work_main()
if(new_version >= coinDesc.GetMiningForkVersion())
{
miner_algo = coinDesc.GetMiningAlgo();
cpu::minethd::func_multi_selector<1>(hash_fun, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
cpu::minethd::func_multi_selector<1>(&cpu_ctx, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
}
else
{
miner_algo = coinDesc.GetMiningAlgoRoot();
cpu::minethd::func_multi_selector<1>(hash_fun, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
cpu::minethd::func_multi_selector<1>(&cpu_ctx, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
}
lastPoolId = oWork.iPoolId;
version = new_version;
Expand Down Expand Up @@ -282,7 +281,7 @@ void minethd::work_main()

*(uint32_t*)(bWorkBlob + 39) = results[i];

hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx, miner_algo);
cpu_ctx->hash_fn(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx, miner_algo);
if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget)
executor::inst()->push_event(ex_event(job_result(oWork.sJobID, results[i], bResult, iThreadNo, miner_algo), oWork.iPoolId));
else
Expand Down
Loading