fancyIX · fancyIX · Mar 12, 2022 · Mar 12, 2022 · Mar 12, 2022 · Mar 15, 2022
diff --git a/.gitignore b/.gitignore
@@ -68,3 +68,4 @@ compat/curl-for-windows/
 *.cudafe1.c
 *.cudafe2.c
 
+*.bak
diff --git a/Algo256/blake2s.cu b/Algo256/blake2s.cu
@@ -34,18 +34,6 @@ uint32_t ROL16(const uint32_t a) {
 #define ROL16(u) (u << 16)
 #endif
 
-__device__ __forceinline__
-uint32_t xor3x(uint32_t a, uint32_t b, uint32_t c)
-{
-	uint32_t result;
-#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050
-	asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result) : "r"(a), "r"(b),"r"(c)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA
-#else
-	result = a^b^c;
-#endif
-	return result;
-}
-
 static const uint32_t blake2s_IV[8] = {
 	0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
 	0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
@@ -562,4 +550,3 @@ extern "C" void free_blake2s(int thr_id)
 
 	cudaDeviceSynchronize();
 }
-
diff --git a/Algo256/bmw512.cu b/Algo256/bmw512.cu
@@ -0,0 +1,130 @@
+/**
+ * BMW512
+ */
+extern "C" {
+#include "sph/sph_bmw.h"
+}
+#include "miner.h"
+#include "cuda_helper.h"
+#include <unistd.h>
+
+#define NBN 2
+
+static uint32_t *d_resNonce[MAX_GPUS];
+static uint32_t *h_resNonce[MAX_GPUS];
+extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads);
+extern void quark_bmw512_cpu_setBlock_80(void *pdata);
+void quark_bmw512_cpu_hash_80_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_resNonce, const uint64_t target);
+
+
+/**
+ * CPU reference implementation of the BMW-512 hash used to validate GPU results.
+ *
+ * @param state  output buffer; receives the first 32 bytes of the 64-byte digest
+ * @param input  message to hash; exactly 80 bytes are read from it
+ */
+extern "C" void bmw512_hash(void *state, const void *input) {
+	unsigned char digest[64];
+	sph_bmw512_context bmw;
+
+	// one-shot hash: init, absorb the 80 input bytes, finalise into digest
+	sph_bmw512_init(&bmw);
+	sph_bmw512(&bmw, input, 80);
+	sph_bmw512_close(&bmw, digest);
+
+	// callers only get the leading 32 bytes of the digest
+	memcpy(state, digest, 32);
+}
+
+
+static bool init[MAX_GPUS] = { 0 };
+
+
+/**
+ * Scan a nonce range for BMW-512 results on the GPU mapped to thr_id.
+ *
+ * Starting at work->data[19], batches of `throughput` nonces are hashed on the
+ * device until a candidate validates on the CPU, max_nonce is reached, or a
+ * work restart is flagged. On first use for a thread, the device is selected
+ * and the NBN-entry device/host result buffers are allocated.
+ *
+ * @param thr_id      miner thread index (selects the device and per-thread buffers)
+ * @param work        work item; data[19] is advanced as the nonce cursor, and
+ *                    nonces[] / valid_nonces are filled when a result validates
+ * @param max_nonce   upper bound of the nonce range
+ * @param hashes_done receives the number of hashes attempted during this call
+ * @return number of valid nonces stored in work (1 or 2), or 0 if none was found
+ */
+extern "C" int scanhash_bmw512(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) {
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	const uint32_t first_nonce = pdata[19];
+	uint32_t endiandata[20];
+
+	if (opt_benchmark) ptarget[7] = 0x00ff;
+
+	// big-endian copy of the 20-word header, used for the GPU and the CPU check
+	for (int k=0; k < 20; k++) be32enc(&endiandata[k], pdata[k]);
+
+	uint32_t throughput =  cuda_default_throughput(thr_id, 1 << 28);
+	// once initialised, never launch more threads than nonces left in the range
+	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+
+	if (!init[thr_id]) {
+		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
+
+		CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)));
+		h_resNonce[thr_id] = (uint32_t*) malloc(NBN * sizeof(uint32_t));
+		if(h_resNonce[thr_id] == NULL){
+			gpulog(LOG_ERR,thr_id,"Host memory allocation failed");
+			exit(EXIT_FAILURE);
+		}
+		quark_bmw512_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);
+
+		init[thr_id] = true;
+	}
+
+	quark_bmw512_cpu_setBlock_80((void*)endiandata);
+	// 0xff fill makes every slot read back as UINT32_MAX, i.e. "no result"
+	cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
+	*hashes_done = 0;
+
+	do {
+		quark_bmw512_cpu_hash_80_final(thr_id, throughput, pdata[19], d_resNonce[thr_id], *(uint64_t*)&ptarget[6]);
+		cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost);
+		*hashes_done += throughput;
+
+		if (h_resNonce[thr_id][0] != UINT32_MAX) {
+			const uint32_t Htarg = ptarget[7];
+			const uint32_t startNounce = pdata[19];
+			// Candidate under test. Kept in a local so the failure path reports
+			// this nonce rather than a stale work->nonces[0].
+			const uint32_t foundNonce = startNounce + h_resNonce[thr_id][0];
+			uint32_t _ALIGN(64) vhash[8];
+
+			// recompute the hash on the CPU before trusting the GPU result
+			be32enc(&endiandata[19], foundNonce);
+			bmw512_hash(vhash, endiandata);
+			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work->nonces[0] = foundNonce;
+				work_set_target_ratio(work, vhash);
+				if (h_resNonce[thr_id][1] != UINT32_MAX) {
+					// NOTE(review): the second nonce is accepted without a CPU
+					// target check; only its ratio is computed — confirm intended.
+					uint32_t secNonce = work->nonces[1] = startNounce + h_resNonce[thr_id][1];
+					be32enc(&endiandata[19], secNonce);
+					bmw512_hash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
+				}
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > Htarg) {
+				gpu_increment_reject(thr_id);
+				if (!opt_quiet)
+					gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
+				// clear all NBN result slots before resuming after the bad nonce
+				cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
+				pdata[19] = foundNonce + 1;
+				continue;
+			}
+			// NOTE(review): a candidate with vhash[7] <= Htarg that fails fulltest()
+			// takes neither branch, so the device result buffer is not reset on
+			// this path — confirm the kernel overwrites it on the next launch.
+		}
+
+		// widen to 64 bits so the end-of-range test cannot wrap around
+		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+		pdata[19] += throughput;
+
+	} while (!work_restart[thr_id].restart);
+	return 0;
+}
+
+
+/**
+ * Release the per-thread result buffers of the BMW-512 scanner.
+ * Does nothing when the thread has not been initialised.
+ *
+ * @param thr_id  miner thread index whose buffers are freed
+ */
+extern "C" void free_bmw512(int thr_id) {
+	if (init[thr_id]) {
+		// select the device this thread is mapped to before touching its memory
+		cudaSetDevice(device_map[thr_id]);
+
+		free(h_resNonce[thr_id]);
+		cudaFree(d_resNonce[thr_id]);
+		// mark as uninitialised so a later scan allocates the buffers again
+		init[thr_id] = false;
+
+		cudaDeviceSynchronize();
+	}
+}
diff --git a/Algo256/cuda_blake256.cu b/Algo256/cuda_blake256.cu
@@ -20,12 +20,12 @@ __device__ uint32_t __byte_perm(uint32_t a, uint32_t b, uint32_t c);
 
 #define UINT2(x,y) make_uint2(x,y)
 
-__device__ __inline__ uint2 ROR8(const uint2 a) {
+/*__device__ __inline__ uint2 ROR8(const uint2 a) {
 	uint2 result;
 	result.x = __byte_perm(a.y, a.x, 0x0765);
 	result.y = __byte_perm(a.x, a.y, 0x0765);
 	return result;
-}
+}*/
 
 static __device__ uint64_t cuda_swab32ll(uint64_t x) {
 	return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x)));

diff --git a/Algo256/cuda_keccak256.cu b/Algo256/cuda_keccak256.cu
@@ -32,18 +32,6 @@ __constant__ uint2 keccak_round_constants[24] = {
 	{ 0x80008081, 0x80000000 }, { 0x00008080, 0x80000000 },	{ 0x80000001, 0x00000000 }, { 0x80008008, 0x80000000 }
 };
 
-__device__ __forceinline__
-uint2 xor3x(const uint2 a,const uint2 b,const uint2 c) {
-	uint2 result;
-#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050
-	asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result.x) : "r"(a.x), "r"(b.x),"r"(c.x)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA
-	asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result.y) : "r"(a.y), "r"(b.y),"r"(c.y)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA
-#else
-	result = a^b^c;
-#endif
-	return result;
-}
-
 __device__ __forceinline__
 uint2 chi(const uint2 a,const uint2 b,const uint2 c) { // keccak chi
 	uint2 result;