From b6d27972f27aa556e00458000684ebf5b3f9350a Mon Sep 17 00:00:00 2001 From: SChernykh Date: Sun, 1 Dec 2019 11:44:17 +0100 Subject: [PATCH 1/3] Combined hash and fill AES loop Adds more parallelizm into AES loop so modern CPUs can take advantage of it. Also, scratchpad data moves between L1 and L3 caches only one time which saves time and energy per hash. --- src/aes_hash.cpp | 81 +++++++++++++++++++++++++++++++++++++++++ src/aes_hash.hpp | 3 ++ src/intrin_portable.h | 7 ++++ src/randomx.cpp | 17 +++++++++ src/randomx.h | 16 ++++++++ src/tests/benchmark.cpp | 9 ++++- src/virtual_machine.cpp | 6 +++ src/virtual_machine.hpp | 2 + 8 files changed, 139 insertions(+), 2 deletions(-) diff --git a/src/aes_hash.cpp b/src/aes_hash.cpp index 1aff37fb..a3b7395b 100644 --- a/src/aes_hash.cpp +++ b/src/aes_hash.cpp @@ -239,3 +239,84 @@ void fillAes4Rx4(void *state, size_t outputSize, void *buffer) { template void fillAes4Rx4(void *state, size_t outputSize, void *buffer); template void fillAes4Rx4(void *state, size_t outputSize, void *buffer); + +template +void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state) { + uint8_t* scratchpadPtr = (uint8_t*)scratchpad; + const uint8_t* scratchpadEnd = scratchpadPtr + scratchpadSize; + + // initial state + rx_vec_i128 hash_state0 = rx_set_int_vec_i128(AES_HASH_1R_STATE0); + rx_vec_i128 hash_state1 = rx_set_int_vec_i128(AES_HASH_1R_STATE1); + rx_vec_i128 hash_state2 = rx_set_int_vec_i128(AES_HASH_1R_STATE2); + rx_vec_i128 hash_state3 = rx_set_int_vec_i128(AES_HASH_1R_STATE3); + + const rx_vec_i128 key0 = rx_set_int_vec_i128(AES_GEN_1R_KEY0); + const rx_vec_i128 key1 = rx_set_int_vec_i128(AES_GEN_1R_KEY1); + const rx_vec_i128 key2 = rx_set_int_vec_i128(AES_GEN_1R_KEY2); + const rx_vec_i128 key3 = rx_set_int_vec_i128(AES_GEN_1R_KEY3); + + rx_vec_i128 fill_state0 = rx_load_vec_i128((rx_vec_i128*)fill_state + 0); + rx_vec_i128 fill_state1 = rx_load_vec_i128((rx_vec_i128*)fill_state + 1); + rx_vec_i128 fill_state2 = rx_load_vec_i128((rx_vec_i128*)fill_state + 2); + rx_vec_i128 fill_state3 = rx_load_vec_i128((rx_vec_i128*)fill_state + 3); + + constexpr int PREFETCH_DISTANCE = 4096; + const char* prefetchPtr = ((const char*)scratchpad) + PREFETCH_DISTANCE; + scratchpadEnd -= PREFETCH_DISTANCE; + + for (int i = 0; i < 2; ++i) { + //process 64 bytes at a time in 4 lanes + while (scratchpadPtr < scratchpadEnd) { + hash_state0 = aesenc(hash_state0, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 0)); + hash_state1 = aesdec(hash_state1, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 1)); + hash_state2 = aesenc(hash_state2, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 2)); + hash_state3 = aesdec(hash_state3, rx_load_vec_i128((rx_vec_i128*)scratchpadPtr + 3)); + + fill_state0 = aesdec(fill_state0, key0); + fill_state1 = aesenc(fill_state1, key1); + fill_state2 = aesdec(fill_state2, key2); + fill_state3 = aesenc(fill_state3, key3); + + rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 0, fill_state0); + rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 1, fill_state1); + rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 2, fill_state2); + rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + 3, fill_state3); + + rx_prefetch_t0(prefetchPtr); + + scratchpadPtr += 64; + prefetchPtr += 64; + } + prefetchPtr = (const char*) scratchpad; + scratchpadEnd += PREFETCH_DISTANCE; + } + + rx_store_vec_i128((rx_vec_i128*)fill_state + 0, fill_state0); + rx_store_vec_i128((rx_vec_i128*)fill_state + 1, fill_state1); + rx_store_vec_i128((rx_vec_i128*)fill_state + 2, fill_state2); + rx_store_vec_i128((rx_vec_i128*)fill_state + 3, fill_state3); + + //two extra rounds to achieve full diffusion + rx_vec_i128 xkey0 = rx_set_int_vec_i128(AES_HASH_1R_XKEY0); + rx_vec_i128 xkey1 = rx_set_int_vec_i128(AES_HASH_1R_XKEY1); + + hash_state0 = aesenc(hash_state0, xkey0); + hash_state1 = aesdec(hash_state1, xkey0); + hash_state2 = aesenc(hash_state2, xkey0); + hash_state3 = aesdec(hash_state3, xkey0); + + hash_state0 = aesenc(hash_state0, xkey1); + hash_state1 = aesdec(hash_state1, xkey1); + hash_state2 = aesenc(hash_state2, xkey1); + hash_state3 = aesdec(hash_state3, xkey1); + + //output hash + rx_store_vec_i128((rx_vec_i128*)hash + 0, hash_state0); + rx_store_vec_i128((rx_vec_i128*)hash + 1, hash_state1); + rx_store_vec_i128((rx_vec_i128*)hash + 2, hash_state2); + rx_store_vec_i128((rx_vec_i128*)hash + 3, hash_state3); +} + +template void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state); +template void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state); diff --git a/src/aes_hash.hpp b/src/aes_hash.hpp index b4d0e940..9f75f73a 100644 --- a/src/aes_hash.hpp +++ b/src/aes_hash.hpp @@ -38,3 +38,6 @@ void fillAes1Rx4(void *state, size_t outputSize, void *buffer); template void fillAes4Rx4(void *state, size_t outputSize, void *buffer); + +template +void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, void* fill_state); diff --git a/src/intrin_portable.h b/src/intrin_portable.h index b5ad91a8..c9d4475a 100644 --- a/src/intrin_portable.h +++ b/src/intrin_portable.h @@ -102,6 +102,7 @@ typedef __m128d rx_vec_f128; #define rx_aligned_alloc(a, b) _mm_malloc(a,b) #define rx_aligned_free(a) _mm_free(a) #define rx_prefetch_nta(x) _mm_prefetch((const char *)(x), _MM_HINT_NTA) +#define rx_prefetch_t0(x) _mm_prefetch((const char *)(x), _MM_HINT_T0) #define rx_load_vec_f128 _mm_load_pd #define rx_store_vec_f128 _mm_store_pd @@ -201,6 +202,7 @@ typedef union{ #define rx_aligned_alloc(a, b) malloc(a) #define rx_aligned_free(a) free(a) #define rx_prefetch_nta(x) +#define rx_prefetch_t0(x) /* Splat 64-bit long long to 2 64-bit long longs */ FORCE_INLINE __m128i vec_splat2sd (int64_t scalar) @@ -399,6 +401,10 @@ inline void rx_prefetch_nta(void* ptr) { asm volatile ("prfm pldl1strm, [%0]\n" : : "r" (ptr)); } +inline void rx_prefetch_t0(const void* ptr) { + asm volatile ("prfm pldl1strm, [%0]\n" : : "r" (ptr)); +} + FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) { return vld1q_f64((const float64_t*)pd); } @@ -532,6 +538,7 @@ typedef union { #define rx_aligned_alloc(a, b) malloc(a) #define rx_aligned_free(a) free(a) #define rx_prefetch_nta(x) +#define rx_prefetch_t0(x) FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) { rx_vec_f128 x; diff --git a/src/randomx.cpp b/src/randomx.cpp index 90fc46a7..79aba1f0 100644 --- a/src/randomx.cpp +++ b/src/randomx.cpp @@ -363,4 +363,21 @@ extern "C" { machine->getFinalResult(output, RANDOMX_HASH_SIZE); } + void randomx_calculate_hash_first(randomx_vm* machine, uint64_t (&tempHash)[8], const void* input, size_t inputSize) { + blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0); + machine->initScratchpad(tempHash); + } + + void randomx_calculate_hash_next(randomx_vm* machine, uint64_t (&tempHash)[8], const void* nextInput, size_t nextInputSize, void* output) { + machine->resetRoundingMode(); + for (uint32_t chain = 0; chain < RANDOMX_PROGRAM_COUNT - 1; ++chain) { + machine->run(&tempHash); + blake2b(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0); + } + machine->run(&tempHash); + + // Finish current hash and fill the scratchpad for the next hash at the same time + blake2b(tempHash, sizeof(tempHash), nextInput, nextInputSize, nullptr, 0); + machine->hashAndFill(output, RANDOMX_HASH_SIZE, tempHash); + } } diff --git a/src/randomx.h b/src/randomx.h index c06002bb..3e41b3ac 100644 --- a/src/randomx.h +++ b/src/randomx.h @@ -30,6 +30,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define RANDOMX_H #include +#include #define RANDOMX_HASH_SIZE 32 #define RANDOMX_DATASET_ITEM_SIZE 64 @@ -238,6 +239,21 @@ RANDOMX_EXPORT void randomx_destroy_vm(randomx_vm *machine); */ RANDOMX_EXPORT void randomx_calculate_hash(randomx_vm *machine, const void *input, size_t inputSize, void *output); +/** + * Paired functions used to calculate multiple RandomX hashes during mining for example. + * + * @param machine is a pointer to a randomx_vm structure. Must not be NULL. + * @param tempHash an array of 8 64-bit values used to store intermediate data between calls to randomx_calculate_hash_first and randomx_calculate_hash_next. + * @param input is a pointer to memory to be hashed. Must not be NULL. + * @param inputSize is the number of bytes to be hashed. + * @param nextInput is a pointer to memory to be hashed for the next hash. Must not be NULL. + * @param nextInputSize is the number of bytes to be hashed for the next hash. + * @param output is a pointer to memory where the hash will be stored. Must not + * be NULL and at least RANDOMX_HASH_SIZE bytes must be available for writing. +*/ +RANDOMX_EXPORT void randomx_calculate_hash_first(randomx_vm* machine, uint64_t (&tempHash)[8], const void* input, size_t inputSize); +RANDOMX_EXPORT void randomx_calculate_hash_next(randomx_vm* machine, uint64_t (&tempHash)[8], const void* nextInput, size_t nextInputSize, void* output); + #if defined(__cplusplus) } #endif diff --git a/src/tests/benchmark.cpp b/src/tests/benchmark.cpp index 03023ab1..868dd288 100644 --- a/src/tests/benchmark.cpp +++ b/src/tests/benchmark.cpp @@ -122,11 +122,16 @@ void mine(randomx_vm* vm, std::atomic& atomicNonce, AtomicHash& result void* noncePtr = blockTemplate + 39; auto nonce = atomicNonce.fetch_add(1); + uint64_t tempHash[8]; + + store32(noncePtr, nonce); + randomx_calculate_hash_first(vm, tempHash, blockTemplate, sizeof(blockTemplate)); + while (nonce < noncesCount) { + nonce = atomicNonce.fetch_add(1); store32(noncePtr, nonce); - randomx_calculate_hash(vm, blockTemplate, sizeof(blockTemplate), &hash); + randomx_calculate_hash_next(vm, tempHash, blockTemplate, sizeof(blockTemplate), &hash); result.xorWith(hash); - nonce = atomicNonce.fetch_add(1); } } diff --git a/src/virtual_machine.cpp b/src/virtual_machine.cpp index d73a0247..0fdc42d2 100644 --- a/src/virtual_machine.cpp +++ b/src/virtual_machine.cpp @@ -120,6 +120,12 @@ namespace randomx { blake2b(out, outSize, ®, sizeof(RegisterFile), nullptr, 0); } + template + void VmBase::hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) { + hashAndFillAes1Rx4((void*) getScratchpad(), ScratchpadSize, ®.a, fill_state); + blake2b(out, outSize, ®, sizeof(RegisterFile), nullptr, 0); + } + template void VmBase::initScratchpad(void* seed) { fillAes1Rx4(seed, ScratchpadSize, scratchpad); diff --git a/src/virtual_machine.hpp b/src/virtual_machine.hpp index d662c895..89f43209 100644 --- a/src/virtual_machine.hpp +++ b/src/virtual_machine.hpp @@ -38,6 +38,7 @@ class randomx_vm { virtual ~randomx_vm() = 0; virtual void allocate() = 0; virtual void getFinalResult(void* out, size_t outSize) = 0; + virtual void hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) = 0; virtual void setDataset(randomx_dataset* dataset) { } virtual void setCache(randomx_cache* cache) { } virtual void initScratchpad(void* seed) = 0; @@ -78,6 +79,7 @@ namespace randomx { void allocate() override; void initScratchpad(void* seed) override; void getFinalResult(void* out, size_t outSize) override; + void hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) override; protected: void generateProgram(void* seed); }; From a76ac019b4d645ece73f494fd2de6c00ed48cf83 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Sun, 1 Dec 2019 12:58:58 +0100 Subject: [PATCH 2/3] Removed C++ code from C API --- src/randomx.cpp | 4 ++-- src/randomx.h | 4 ++-- src/virtual_machine.cpp | 2 +- src/virtual_machine.hpp | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/randomx.cpp b/src/randomx.cpp index 79aba1f0..dd5f4475 100644 --- a/src/randomx.cpp +++ b/src/randomx.cpp @@ -363,12 +363,12 @@ extern "C" { machine->getFinalResult(output, RANDOMX_HASH_SIZE); } - void randomx_calculate_hash_first(randomx_vm* machine, uint64_t (&tempHash)[8], const void* input, size_t inputSize) { + void randomx_calculate_hash_first(randomx_vm* machine, uint64_t *tempHash, const void* input, size_t inputSize) { blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0); machine->initScratchpad(tempHash); } - void randomx_calculate_hash_next(randomx_vm* machine, uint64_t (&tempHash)[8], const void* nextInput, size_t nextInputSize, void* output) { + void randomx_calculate_hash_next(randomx_vm* machine, uint64_t *tempHash, const void* nextInput, size_t nextInputSize, void* output) { machine->resetRoundingMode(); for (uint32_t chain = 0; chain < RANDOMX_PROGRAM_COUNT - 1; ++chain) { machine->run(&tempHash); diff --git a/src/randomx.h b/src/randomx.h index 3e41b3ac..4a7fcbf3 100644 --- a/src/randomx.h +++ b/src/randomx.h @@ -251,8 +251,8 @@ RANDOMX_EXPORT void randomx_calculate_hash(randomx_vm *machine, const void *inpu * @param output is a pointer to memory where the hash will be stored. Must not * be NULL and at least RANDOMX_HASH_SIZE bytes must be available for writing. */ -RANDOMX_EXPORT void randomx_calculate_hash_first(randomx_vm* machine, uint64_t (&tempHash)[8], const void* input, size_t inputSize); -RANDOMX_EXPORT void randomx_calculate_hash_next(randomx_vm* machine, uint64_t (&tempHash)[8], const void* nextInput, size_t nextInputSize, void* output); +RANDOMX_EXPORT void randomx_calculate_hash_first(randomx_vm* machine, uint64_t *tempHash, const void* input, size_t inputSize); +RANDOMX_EXPORT void randomx_calculate_hash_next(randomx_vm* machine, uint64_t *tempHash, const void* nextInput, size_t nextInputSize, void* output); #if defined(__cplusplus) } diff --git a/src/virtual_machine.cpp b/src/virtual_machine.cpp index 0fdc42d2..2d5d2bea 100644 --- a/src/virtual_machine.cpp +++ b/src/virtual_machine.cpp @@ -121,7 +121,7 @@ namespace randomx { } template - void VmBase::hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) { + void VmBase::hashAndFill(void* out, size_t outSize, uint64_t *fill_state) { hashAndFillAes1Rx4((void*) getScratchpad(), ScratchpadSize, ®.a, fill_state); blake2b(out, outSize, ®, sizeof(RegisterFile), nullptr, 0); } diff --git a/src/virtual_machine.hpp b/src/virtual_machine.hpp index 89f43209..4e89366c 100644 --- a/src/virtual_machine.hpp +++ b/src/virtual_machine.hpp @@ -38,7 +38,7 @@ class randomx_vm { virtual ~randomx_vm() = 0; virtual void allocate() = 0; virtual void getFinalResult(void* out, size_t outSize) = 0; - virtual void hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) = 0; + virtual void hashAndFill(void* out, size_t outSize, uint64_t *fill_state) = 0; virtual void setDataset(randomx_dataset* dataset) { } virtual void setCache(randomx_cache* cache) { } virtual void initScratchpad(void* seed) = 0; @@ -79,7 +79,7 @@ namespace randomx { void allocate() override; void initScratchpad(void* seed) override; void getFinalResult(void* out, size_t outSize) override; - void hashAndFill(void* out, size_t outSize, uint64_t (&fill_state)[8]) override; + void hashAndFill(void* out, size_t outSize, uint64_t *fill_state) override; protected: void generateProgram(void* seed); }; From 5f053881df3f975717498c66de70a2ddf93e5822 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Sun, 1 Dec 2019 13:34:11 +0100 Subject: [PATCH 3/3] Fixed incorrect sizeof --- src/randomx.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/randomx.cpp b/src/randomx.cpp index dd5f4475..c4281b3d 100644 --- a/src/randomx.cpp +++ b/src/randomx.cpp @@ -364,20 +364,20 @@ extern "C" { } void randomx_calculate_hash_first(randomx_vm* machine, uint64_t *tempHash, const void* input, size_t inputSize) { - blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0); + blake2b(tempHash, sizeof(uint64_t) * 8, input, inputSize, nullptr, 0); machine->initScratchpad(tempHash); } void randomx_calculate_hash_next(randomx_vm* machine, uint64_t *tempHash, const void* nextInput, size_t nextInputSize, void* output) { machine->resetRoundingMode(); for (uint32_t chain = 0; chain < RANDOMX_PROGRAM_COUNT - 1; ++chain) { - machine->run(&tempHash); - blake2b(tempHash, sizeof(tempHash), machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0); + machine->run(tempHash); + blake2b(tempHash, sizeof(uint64_t) * 8, machine->getRegisterFile(), sizeof(randomx::RegisterFile), nullptr, 0); } - machine->run(&tempHash); + machine->run(tempHash); // Finish current hash and fill the scratchpad for the next hash at the same time - blake2b(tempHash, sizeof(tempHash), nextInput, nextInputSize, nullptr, 0); + blake2b(tempHash, sizeof(uint64_t) * 8, nextInput, nextInputSize, nullptr, 0); machine->hashAndFill(output, RANDOMX_HASH_SIZE, tempHash); } }