diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp
index e478600e3..199025635 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp
@@ -303,3 +303,9 @@ __device__ __forceinline__ static void cn_aes_gpu_init(uint32_t *sharedMemory)
 	for(int i = threadIdx.x; i < 1024; i += blockDim.x)
 		sharedMemory[i] = d_t_fn[i];
 }
+
+__device__ __forceinline__ static void cn_aes_gpu_init_half(uint32_t *sharedMemory)
+{
+	for(int i = threadIdx.x; i < 512; i += blockDim.x)
+		sharedMemory[i] = d_t_fn[i];
+}
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index fa7e09364..2943e67b2 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -121,6 +121,11 @@ __device__ __forceinline__ void storeGlobal64( T* addr, T const & val )
 #endif
 }
 
+__device__ __forceinline__ uint32_t rotate16( const uint32_t n )
+{
+	return (n >> 16u) | (n << 16u);
+}
+
 template<size_t ITERATIONS, uint32_t MEMORY>
 __global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state2, uint32_t * __restrict__ ctx_key1 )
 {
@@ -267,9 +272,9 @@ __launch_bounds__( XMR_STAK_THREADS * 2 )
 __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state, uint32_t startNonce, uint32_t * __restrict__ d_input )
 {
-	__shared__ uint32_t sharedMemory[1024];
+	__shared__ uint32_t sharedMemory[512];
 
-	cn_aes_gpu_init( sharedMemory );
+	cn_aes_gpu_init_half( sharedMemory );
 
 #if( __CUDA_ARCH__ < 300 )
 	extern __shared__ uint64_t externShared[];
@@ -340,8 +345,8 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 		const u64 cx2 = myChunks[ idx1 + ((sub + 1) & 1) ];
 
 		u64 cx_aes = ax0 ^ u64(
-			t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ t_fn2( (cx2.x >> 16) & 0xff ) ^ t_fn3( (cx2.y >> 24 ) ),
-			t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ t_fn2( (cx2.y >> 16) & 0xff ) ^ t_fn3( (cx.x >> 24 ) )
+			t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ rotate16(t_fn0( (cx2.x >> 16) & 0xff ) ^ t_fn1( (cx2.y >> 24 ) )),
+			t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ rotate16(t_fn0( (cx2.y >> 16) & 0xff ) ^ t_fn1( (cx.x >> 24 ) ))
 		);
 
 		if(ALGO == cryptonight_monero_v8)
@@ -840,9 +845,9 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce)
 {
 	typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce);
-	
+
 	if(miner_algo == invalid_algo) return;
-	
+
 	static const cuda_hash_fn func_table[] = {
 		cryptonight_core_gpu_hash<cryptonight>,
 		cryptonight_core_gpu_hash<cryptonight_lite>,
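
Background for the change above: the four 256-entry AES forward T-tables packed into d_t_fn are byte rotations of one another, so the third table equals the first rotated by 16 bits and the fourth equals the second rotated by 16 bits. The patch therefore stages only the first two tables in shared memory (512 words, 2 KiB per block instead of 4 KiB) via cn_aes_gpu_init_half, and recovers the former t_fn2/t_fn3 lookups with the new rotate16 helper. The host-side program below is an illustrative sketch, not part of the patch: it rebuilds the four table entries from the standard AES S-box in the byte layout d_t_fn uses (t_fn0[0] == 0xa56363c6) and asserts the two identities the kernel now relies on.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Standard AES S-box, from which the T-tables are derived. */
static const uint8_t sbox[256] = {
	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
};

/* Same operation as the rotate16() added to cuda_core.cu. */
static uint32_t rotate16(uint32_t n) { return (n >> 16) | (n << 16); }

int main(void)
{
	for (int x = 0; x < 256; ++x)
	{
		uint32_t s  = sbox[x];
		uint32_t s2 = ((s << 1) ^ ((s >> 7) * 0x1b)) & 0xff; /* xtime(s) in GF(2^8) */
		uint32_t s3 = s ^ s2;

		/* The four T-table entries, written out explicitly in the byte
		 * layout of d_t_fn; each table is the previous one rotated
		 * left by 8 bits. */
		uint32_t t0 = (s3 << 24) | (s  << 16) | (s  << 8) | s2;
		uint32_t t1 = (s  << 24) | (s  << 16) | (s2 << 8) | s3;
		uint32_t t2 = (s  << 24) | (s2 << 16) | (s3 << 8) | s;
		uint32_t t3 = (s2 << 24) | (s3 << 16) | (s  << 8) | s;

		/* The identities the patch exploits: the second half of the
		 * table is a 16-bit rotation of the first half. */
		assert(t2 == rotate16(t0));
		assert(t3 == rotate16(t1));
	}
	puts("ok: t_fn2(x) == rotate16(t_fn0(x)) and t_fn3(x) == rotate16(t_fn1(x)) for all x");
	return 0;
}

A 32-bit rotation by 16 is its own inverse, so a single rotate16 works in either direction, and on the GPU it typically lowers to one funnel-shift or PRMT instruction. The trade is two extra rotates per AES round in exchange for halving the kernel's static shared memory, which can allow more blocks per multiprocessor.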