fireice-uk · fireice-uk · Mar 7, 2019 · Feb 21, 2019 · Mar 4, 2019 · Mar 4, 2019
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -508,18 +508,23 @@ if(CMAKE_C_COMPILER_ID MATCHES "MSVC")
     # asm optimized monero v8 code
     enable_language(ASM_MASM)
     set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm" PROPERTY ASM_MASM)
+    set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cnR/CryptonightR_template.asm" PROPERTY ASM_MASM)
     add_library(xmr-stak-asm
         STATIC
         "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.asm"
+        "xmrstak/backend/cpu/crypto/asm/cnR/CryptonightR_template.asm"
     )
 else()
     # asm optimized monero v8 code
     enable_language(ASM)
     set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" PROPERTY CPP)
+    set_property(SOURCE "xmrstak/backend/cpu/crypto/asm/cnR/CryptonightR_template.S" PROPERTY CPP)
     set_source_files_properties("xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S" PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp")
+    set_source_files_properties("xmrstak/backend/cpu/crypto/asm/cnR/CryptonightR_template.S" PROPERTIES COMPILE_FLAGS "-x assembler-with-cpp")
     add_library(xmr-stak-asm
         STATIC
         "xmrstak/backend/cpu/crypto/asm/cryptonight_v8_main_loop.S"
+        "xmrstak/backend/cpu/crypto/asm/cnR/CryptonightR_template.S"
     )
 endif()
 

diff --git a/README.md b/README.md
@@ -71,6 +71,7 @@ If your prefered coin is not listed, you can choose one of the following algorit
     - cryptonight_v7_stellite
     - cryptonight_v8
     - cryptonight_v8_half (used by masari and stellite)
+    - cryptonight_v8_reversewaltz (used by graft)
     - cryptonight_v8_zelerius
 - 4MiB scratchpad memory
     - cryptonight_haven

diff --git a/xmrstak/backend/amd/OclCryptonightR_gen.cpp b/xmrstak/backend/amd/OclCryptonightR_gen.cpp
@@ -282,7 +282,7 @@ cl_program CryptonightR_get_program(GpuContext* ctx, xmrstak_algo algo, uint64_t
         code_size = v4_random_math_init<cryptonight_r>(code, height);
         break;
     default:
-        printer::inst()->print_msg(LDEBUG, "CryptonightR_get_program: invalid algo %d", algo);
+        printer::inst()->print_msg(L0, "CryptonightR_get_program: invalid algo %d", algo);
         return nullptr;
     }
 

diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -294,7 +294,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		 * this is required if the dev pool is mining monero
 		 * but the user tuned there settings for another currency
 		 */
-		if(miner_algo == cryptonight_monero_v8)
+		if(miner_algo == cryptonight_monero_v8 || miner_algo == cryptonight_v8_reversewaltz)
 		{
 			if(ctx->memChunk < 2)
 				mem_chunk_exp = 1u << 2;
@@ -774,7 +774,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	// Same as the platform index sanity check, except we must check all requested device indexes
 	for(int i = 0; i < num_gpus; ++i)
 	{
-		if(entries <= ctx[i].deviceIdx)
+		if(ctx[i].deviceIdx >= entries)
 		{
 			printer::inst()->print_msg(L1,"Selected OpenCL device index %lu doesn't exist.\n", ctx[i].deviceIdx);
 			return ERR_STUPID_PARAMS;
@@ -793,17 +793,22 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 	}
 
 	// Indexes sanity checked above
-#ifdef __GNUC__
-	cl_device_id TempDeviceList[num_gpus];
-#else
-	cl_device_id* TempDeviceList = (cl_device_id*)_alloca(entries * sizeof(cl_device_id));
-#endif
+	std::vector<cl_device_id> TempDeviceList(num_gpus, nullptr);
+
+	printer::inst()->print_msg(LDEBUG, "Number of OpenCL GPUs %d", entries);
 	for(int i = 0; i < num_gpus; ++i)
 	{
 		ctx[i].DeviceID = DeviceIDList[ctx[i].deviceIdx];
 		TempDeviceList[i] = DeviceIDList[ctx[i].deviceIdx];
 	}
 
+	cl_context opencl_ctx = clCreateContext(NULL, num_gpus, TempDeviceList.data(), NULL, NULL, &ret);
+	if(ret != CL_SUCCESS)
+	{
+		printer::inst()->print_msg(L1,"Error %s when calling clCreateContext.", err_to_str(ret));
+		return ERR_OCL_API;
+	}
+
 	const char *fastIntMathV2CL =
 			#include "./opencl/fast_int_math_v2.cl"
 	;
@@ -847,22 +852,9 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 
 	std::vector<std::shared_ptr<InterleaveData>> interleaveData(num_gpus, nullptr);
 
-	std::vector<cl_context> context_vec(entries, nullptr);
-	for(int i = 0; i < num_gpus; ++i)
-	{
-		if(context_vec[ctx[i].deviceIdx] == nullptr)
-		{
-			context_vec[ctx[i].deviceIdx] = clCreateContext(NULL, 1, &(ctx[i].DeviceID), NULL, NULL, &ret);
-			if(ret != CL_SUCCESS)
-			{
-				printer::inst()->print_msg(L1,"Error %s when calling clCreateContext.", err_to_str(ret));
-				return ERR_OCL_API;
-			}
-		}
-	}
-
 	for(int i = 0; i < num_gpus; ++i)
 	{
+		printer::inst()->print_msg(LDEBUG,"OpenCL Init device %d", ctx[i].deviceIdx);
 		const size_t devIdx = ctx[i].deviceIdx;
 		if(interleaveData.size() <= devIdx)
 		{
@@ -879,7 +871,7 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx)
 		ctx[i].interleaveData = interleaveData[devIdx];
 		ctx[i].interleaveData->adjustThreshold = static_cast<double>(ctx[i].interleave)/100.0;
 		ctx[i].interleaveData->startAdjustThreshold = ctx[i].interleaveData->adjustThreshold;
-		ctx[i].opencl_ctx = context_vec[ctx[i].deviceIdx];
+		ctx[i].opencl_ctx = opencl_ctx;
 
 		if((ret = InitOpenCLGpu(ctx->opencl_ctx, &ctx[i], source_code.c_str())) != ERR_SUCCESS)
 		{

diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl
@@ -30,6 +30,7 @@ R"===(
 #define cryptonight_superfast 12
 #define cryptonight_gpu 13
 #define cryptonight_conceal 14
+#define cryptonight_v8_reversewaltz 17
 
 /* For Mesa clover support */
 #ifdef cl_clang_storage_class_specifiers
@@ -639,7 +640,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad,
 R"===(
 
 // __NV_CL_C_VERSION checks if NVIDIA opencl is used
-#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION))
+#if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && defined(__NV_CL_C_VERSION))
 #	define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4))))
 #	define SCRATCHPAD_CHUNK_GLOBAL (*((__global uint16*)(Scratchpad + (IDX((idx0 & 0x1FFFC0U) >> 4)))))
 #else
@@ -659,7 +660,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 	float4 conc_var = (float4)(0.0f);
 #endif
 
-#if(ALGO == cryptonight_monero_v8)
+#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
 	ulong b[4];
 	uint4 b_x[2];
 // NVIDIA
@@ -673,7 +674,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 #endif
 	__local uint AES0[256], AES1[256];
 
-#if(ALGO == cryptonight_monero_v8)
+#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
 #	if defined(__clang__) && !defined(__NV_CL_C_VERSION)
 	__local uint RCP[256];
 #	endif
@@ -689,7 +690,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 		AES0[i] = tmp;
 		AES1[i] = rotate(tmp, 8U);
 
-#if(ALGO == cryptonight_monero_v8 && (defined(__clang__) && !defined(__NV_CL_C_VERSION)))
+#if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && (defined(__clang__) && !defined(__NV_CL_C_VERSION)))
 		RCP[i] = RCP_C[i];
 #endif
 	}
@@ -723,7 +724,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 
 		b_x[0] = ((uint4 *)b)[0];
 
-#if(ALGO == cryptonight_monero_v8)
+#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
 		a[1] = states[1] ^ states[5];
 		b[2] = states[8] ^ states[10];
 		b[3] = states[9] ^ states[11];
@@ -755,7 +756,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 	{
 			ulong c[2];
 
-#if(ALGO == cryptonight_monero_v8 && defined(__NV_CL_C_VERSION))
+#if((ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz) && defined(__NV_CL_C_VERSION))
 			uint idxS = idx0 & 0x30U;
  			*scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL;
 #endif
@@ -792,6 +793,15 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
 			SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
 		}
+#elif(ALGO == cryptonight_v8_reversewaltz)
+		{
+			ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(1));
+			ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
+			ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(3));
+			SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
+			SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
+			SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
+		}
 #endif
 
 #if(ALGO == cryptonight_monero || ALGO == cryptonight_aeon || ALGO == cryptonight_ipbc || ALGO == cryptonight_stellite || ALGO == cryptonight_masari || ALGO == cryptonight_bittube2)
@@ -807,7 +817,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			SCRATCHPAD_CHUNK(0) = b_x[0];
 			idx0 = as_uint2(c[0]).s0 & MASK;
 
-#elif(ALGO == cryptonight_monero_v8)
+#elif(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
 			SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0];
 #	ifdef __NV_CL_C_VERSION
 			// flush shuffled data
@@ -826,7 +836,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			uint4 tmp;
 			tmp = SCRATCHPAD_CHUNK(0);
 
-#if(ALGO == cryptonight_monero_v8)
+#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
 			// Use division and square root results from the _previous_ iteration to hide the latency
 			tmp.s0 ^= division_result.s0;
 			tmp.s1 ^= division_result.s1 ^ sqrt_result;
@@ -853,8 +863,13 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 			ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2));
 			result_mul ^= chunk2;
 			ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3));
+#if(ALGO == cryptonight_v8_reversewaltz)
+			SCRATCHPAD_CHUNK(1) = as_uint4(chunk1 + ((ulong2 *)(b_x + 1))[0]);
+			SCRATCHPAD_CHUNK(2) = as_uint4(chunk3 + ((ulong2 *)b_x)[0]);
+#else
 			SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]);
 			SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]);
+#endif
 			SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]);
 			a[0] += result_mul.s0;
 			a[1] += result_mul.s1;
@@ -882,7 +897,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states
 
 		((uint4 *)a)[0] ^= tmp;
 
-#if (ALGO == cryptonight_monero_v8)
+#if (ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
 #	if defined(__NV_CL_C_VERSION)
 			// flush shuffled data
 			SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line;

diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl
@@ -3,7 +3,7 @@ R"===(
  * @author SChernykh
  */
 
-#if(ALGO == cryptonight_monero_v8)
+#if(ALGO == cryptonight_monero_v8 || ALGO == cryptonight_v8_reversewaltz)
 
 static const __constant uint RCP_C[256] =
 {

diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
@@ -6,26 +6,26 @@ R"===(// generated by XMRSTAK_VERSION
  * intensity     - Number of parallel GPU threads (nothing to do with CPU threads)
  * worksize      - Number of local GPU threads (nothing to do with CPU threads)
  * affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner.
- * strided_index - switch memory pattern used for the scratch pad memory
+ * strided_index - switch memory pattern used for the scratchpad memory
  *                 3 = chunked memory, chunk size based on the 'worksize'
  *                     required: intensity must be a multiple of worksize
  *                 2 = chunked memory, chunk size is controlled by 'mem_chunk'
  *                     required: intensity must be a multiple of worksize
- *                 1 or true  = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks
+ *                 1 or true  = use 16 byte contiguous memory per thread, the next memory block has offset of intensity blocks
  *                             (for cryptonight_v8 and monero it is equal to strided_index = 0)
  *                 0 or false = use a contiguous block of memory per thread
  * mem_chunk     - range 0 to 18: set the number of elements (16byte) per chunk
  *                 this value is only used if 'strided_index' == 2
- *                 element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256byte)
+ *                 element count is computed with the equation: 2 to the power of 'mem_chunk' e.g. 4 means a chunk of 16 elements(256 byte)
  * unroll        - allow to control how often the POW main loop is unrolled; valid range [1;128) - for most OpenCL implementations it must be a power of two.
  * comp_mode     - Compatibility enable/disable the automatic guard around compute kernel which allows
- *                 to use a intensity which is not the multiple of the worksize.
+ *                 to use an intensity which is not the multiple of the worksize.
  *                 If you set false and the intensity is not multiple of the worksize the miner can crash:
  *                 in this case set the intensity to a multiple of the worksize or activate comp_mode.
  * interleave    - Controls the starting point in time between two threads on the same GPU device relative to the last started thread.
  *                 This option has only an effect if two compute threads using the same GPU device: valid range [0;100]
  *                 0  = disable thread interleaving
- *                 40 = each working thread waits until 40% of the hash calculation of the previous started thread is finished
+ *                 40 = each working thread waits until 40% of the hash calculation of the previously started thread is finished
  * "gpu_threads_conf" :
  * [
  *     { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,

diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
@@ -186,8 +186,7 @@ void minethd::work_main()
 
 	cpu::minethd::cn_on_new_job set_job;
 
-	cn_hash_fun hash_fun;
-	cpu::minethd::func_multi_selector<1>(hash_fun, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
+	cpu::minethd::func_multi_selector<1>(&cpu_ctx, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
 
 	uint8_t version = 0;
 	size_t lastPoolId = 0;
@@ -228,12 +227,12 @@ void minethd::work_main()
 			if(new_version >= coinDesc.GetMiningForkVersion())
 			{
 				miner_algo = coinDesc.GetMiningAlgo();
-				cpu::minethd::func_multi_selector<1>(hash_fun, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
+				cpu::minethd::func_multi_selector<1>(&cpu_ctx, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
 			}
 			else
 			{
 				miner_algo = coinDesc.GetMiningAlgoRoot();
-				cpu::minethd::func_multi_selector<1>(hash_fun, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
+				cpu::minethd::func_multi_selector<1>(&cpu_ctx, set_job, ::jconf::inst()->HaveHardwareAes(), true /*bNoPrefetch*/, miner_algo);
 			}
 			lastPoolId = oWork.iPoolId;
 			version = new_version;
@@ -282,7 +281,7 @@ void minethd::work_main()
 
 				*(uint32_t*)(bWorkBlob + 39) = results[i];
 
-				hash_fun(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx, miner_algo);
+				cpu_ctx->hash_fn(bWorkBlob, oWork.iWorkSize, bResult, &cpu_ctx, miner_algo);
 				if ( (*((uint64_t*)(bResult + 24))) < oWork.iTarget)
 					executor::inst()->push_event(ex_event(job_result(oWork.sJobID, results[i], bResult, iThreadNo, miner_algo), oWork.iPoolId));
 				else