From 6f2839280e1e842722516d6fcb85de505f23f473 Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Mon, 19 Nov 2018 20:38:07 +0100 Subject: [PATCH] OpenCL: reduce local mem footprint Reduce local memory foot print to increase the occupancy. Co-authored-by: SChernykh --- .../backend/amd/amd_gpu/opencl/cryptonight.cl | 8 +-- .../backend/amd/amd_gpu/opencl/wolf-aes.cl | 59 +++++++++++-------- 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 81c0d5ff9..ec46ebc1b 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -562,7 +562,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states ulong b[2]; uint4 b_x[1]; #endif - __local uint AES0[256], AES1[256], AES2[256], AES3[256]; + __local uint AES0[256], AES1[256]; // cryptonight_monero_v8 #if(ALGO==11) @@ -577,8 +577,6 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states const uint tmp = AES0_C[i]; AES0[i] = tmp; AES1[i] = rotate(tmp, 8U); - AES2[i] = rotate(tmp, 16U); - AES3[i] = rotate(tmp, 24U); // cryptonight_monero_v8 #if(ALGO==11) RCP[i] = RCP_C[i]; @@ -653,9 +651,9 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states ((uint4 *)c)[0] = SCRATCHPAD_CHUNK(0); // cryptonight_bittube2 #if(ALGO == 10) - ((uint4 *)c)[0] = AES_Round_bittube2(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); + ((uint4 *)c)[0] = AES_Round2(AES0, AES1, ~((uint4 *)c)[0], ((uint4 *)a)[0]); #else - ((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); + ((uint4 *)c)[0] = AES_Round2(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]); #endif // cryptonight_monero_v8 diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl index 50e861e23..c3125d90a 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl @@ -74,42 +74,49 @@ static const __constant uint AES0_C[256] = #define BYTE(x, y) (amd_bfe((x), (y) << 3U, 8U)) -inline uint4 AES_Round_bittube2(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, uint4 x, uint4 k) -{ - x = ~x; - k.s0 ^= AES0[BYTE(x.s0, 0)] ^ AES1[BYTE(x.s1, 1)] ^ AES2[BYTE(x.s2, 2)] ^ AES3[BYTE(x.s3, 3)]; - x.s0 ^= k.s0; - k.s1 ^= AES0[BYTE(x.s1, 0)] ^ AES1[BYTE(x.s2, 1)] ^ AES2[BYTE(x.s3, 2)] ^ AES3[BYTE(x.s0, 3)]; - x.s1 ^= k.s1; - k.s2 ^= AES0[BYTE(x.s2, 0)] ^ AES1[BYTE(x.s3, 1)] ^ AES2[BYTE(x.s0, 2)] ^ AES3[BYTE(x.s1, 3)]; - x.s2 ^= k.s2; - k.s3 ^= AES0[BYTE(x.s3, 0)] ^ AES1[BYTE(x.s0, 1)] ^ AES2[BYTE(x.s1, 2)] ^ AES3[BYTE(x.s2, 3)]; - return k; -} - uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, uint4 key) { key.s0 ^= AES0[BYTE(X.s0, 0)]; - key.s1 ^= AES0[BYTE(X.s1, 0)]; - key.s2 ^= AES0[BYTE(X.s2, 0)]; - key.s3 ^= AES0[BYTE(X.s3, 0)]; + key.s1 ^= AES0[BYTE(X.s1, 0)]; + key.s2 ^= AES0[BYTE(X.s2, 0)]; + key.s3 ^= AES0[BYTE(X.s3, 0)]; key.s0 ^= AES2[BYTE(X.s2, 2)]; - key.s1 ^= AES2[BYTE(X.s3, 2)]; - key.s2 ^= AES2[BYTE(X.s0, 2)]; - key.s3 ^= AES2[BYTE(X.s1, 2)]; + key.s1 ^= AES2[BYTE(X.s3, 2)]; + key.s2 ^= AES2[BYTE(X.s0, 2)]; + key.s3 ^= AES2[BYTE(X.s1, 2)]; key.s0 ^= AES1[BYTE(X.s1, 1)]; - key.s1 ^= AES1[BYTE(X.s2, 1)]; - key.s2 ^= AES1[BYTE(X.s3, 1)]; - key.s3 ^= AES1[BYTE(X.s0, 1)]; + key.s1 ^= AES1[BYTE(X.s2, 1)]; + key.s2 ^= AES1[BYTE(X.s3, 1)]; + key.s3 ^= AES1[BYTE(X.s0, 1)]; key.s0 ^= AES3[BYTE(X.s3, 3)]; - key.s1 ^= AES3[BYTE(X.s0, 3)]; - key.s2 ^= AES3[BYTE(X.s1, 3)]; - key.s3 ^= AES3[BYTE(X.s2, 3)]; + key.s1 ^= AES3[BYTE(X.s0, 3)]; + key.s2 ^= AES3[BYTE(X.s1, 3)]; + key.s3 ^= AES3[BYTE(X.s2, 3)]; + + return key; +} + +uint4 AES_Round2(const __local uint *AES0, const __local uint *AES1, const uint4 X, uint4 key) +{ + key.s0 ^= AES0[BYTE(X.s0, 0)]; + key.s1 ^= AES0[BYTE(X.s1, 0)]; + key.s2 ^= AES0[BYTE(X.s2, 0)]; + key.s3 ^= AES0[BYTE(X.s3, 0)]; + + key.s0 ^= rotate(AES0[BYTE(X.s2, 2)] ^ AES1[BYTE(X.s3, 3)], 16u); + key.s1 ^= rotate(AES0[BYTE(X.s3, 2)] ^ AES1[BYTE(X.s0, 3)], 16u); + key.s2 ^= rotate(AES0[BYTE(X.s0, 2)] ^ AES1[BYTE(X.s1, 3)], 16u); + key.s3 ^= rotate(AES0[BYTE(X.s1, 2)] ^ AES1[BYTE(X.s2, 3)], 16u); + + key.s0 ^= AES1[BYTE(X.s1, 1)]; + key.s1 ^= AES1[BYTE(X.s2, 1)]; + key.s2 ^= AES1[BYTE(X.s3, 1)]; + key.s3 ^= AES1[BYTE(X.s0, 1)]; - return key; + return key; } #endif