From ea47136ba6a56795913d7c8e65a3a5b29a5e396a Mon Sep 17 00:00:00 2001 From: fireice-uk Date: Sun, 29 Dec 2019 12:11:55 +0000 Subject: [PATCH] Implement Ryzen speedups --- .../asm/program_read_dataset_ryzen.inc | 18 +++++ .../cpu/crypto/randomx/jit_compiler_x86.cpp | 76 +++++++++++++------ .../cpu/crypto/randomx/jit_compiler_x86.hpp | 19 ++++- .../crypto/randomx/jit_compiler_x86_static.S | 5 ++ .../randomx/jit_compiler_x86_static.asm | 8 +- .../randomx/jit_compiler_x86_static.hpp | 1 + .../backend/cpu/crypto/randomx/randomx.cpp | 9 ++- xmrstak/backend/cpu/crypto/randomx/randomx.h | 5 +- 8 files changed, 113 insertions(+), 28 deletions(-) create mode 100644 xmrstak/backend/cpu/crypto/randomx/asm/program_read_dataset_ryzen.inc diff --git a/xmrstak/backend/cpu/crypto/randomx/asm/program_read_dataset_ryzen.inc b/xmrstak/backend/cpu/crypto/randomx/asm/program_read_dataset_ryzen.inc new file mode 100644 index 000000000..6bb87c8f9 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/randomx/asm/program_read_dataset_ryzen.inc @@ -0,0 +1,18 @@ + mov rcx, rbp ;# ecx = ma + shr rcx, 32 + and ecx, RANDOMX_DATASET_BASE_MASK + xor rbp, rax ;# modify "mx" + mov rax, qword ptr [rdi+rcx] + mov edx, ebp ;# edx = mx + and edx, RANDOMX_DATASET_BASE_MASK + prefetchnta byte ptr [rdi+rdx] + ror rbp, 32 ;# swap "ma" and "mx" + xor r8, rax + xor r9, qword ptr [rdi+rcx+8] + xor r10, qword ptr [rdi+rcx+16] + xor r11, qword ptr [rdi+rcx+24] + xor r12, qword ptr [rdi+rcx+32] + xor r13, qword ptr [rdi+rcx+40] + xor r14, qword ptr [rdi+rcx+48] + xor r15, qword ptr [rdi+rcx+56] + \ No newline at end of file diff --git a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.cpp b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.cpp index bfde7d002..433d486cf 100644 --- a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.cpp +++ b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.cpp @@ -224,8 +224,6 @@ namespace randomx { {0x0F, 0x1F, 0x44, 0x00, 0x00, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E}, }; - bool JitCompilerX86::BranchesWithin32B = false; - size_t JitCompilerX86::getCodeSize() { return codePos < prologueSize ? 0 : codePos - prologueSize; } @@ -241,8 +239,14 @@ namespace randomx { # endif } + std::atomic JitCompilerX86::flags_set(0); + uint64_t JitCompilerX86::flags = 0; // CPU-specific tweaks void JitCompilerX86::applyTweaks() { + + if(flags_set.fetch_add(1) != 0) + return; + int32_t info[4]; cpuid(0, info); @@ -252,38 +256,42 @@ namespace randomx { manufacturer[2] = info[2]; manufacturer[3] = 0; - if (strcmp((const char*)manufacturer, "GenuineIntel") == 0) { - struct - { - unsigned int stepping : 4; - unsigned int model : 4; - unsigned int family : 4; - unsigned int processor_type : 2; - unsigned int reserved1 : 2; - unsigned int ext_model : 4; - unsigned int ext_family : 8; - unsigned int reserved2 : 4; - } processor_info; - - cpuid(1, info); - memcpy(&processor_info, info, sizeof(processor_info)); + struct + { + unsigned int stepping : 4; + unsigned int model : 4; + unsigned int family : 4; + unsigned int processor_type : 2; + unsigned int reserved1 : 2; + unsigned int ext_model : 4; + unsigned int ext_family : 8; + unsigned int reserved2 : 4; + } processor_info; + + cpuid(1, info); + memcpy(&processor_info, info, sizeof(processor_info)); + if (strcmp((const char*)manufacturer, "GenuineIntel") == 0) { // Intel JCC erratum mitigation if (processor_info.family == 6) { const uint32_t model = processor_info.model | (processor_info.ext_model << 4); const uint32_t stepping = processor_info.stepping; // Affected CPU models and stepping numbers are taken from https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf - BranchesWithin32B = + set_flag(BRANCHES_WITHIN_32B, ((model == 0x4E) && (stepping == 0x3)) || ((model == 0x55) && (stepping == 0x4)) || ((model == 0x5E) && (stepping == 0x3)) || ((model == 0x8E) && (stepping >= 0x9) && (stepping <= 0xC)) || ((model == 0x9E) && (stepping >= 0x9) && (stepping <= 0xD)) || ((model == 0xA6) && (stepping == 0x0)) || - ((model == 0xAE) && (stepping == 0xA)); + ((model == 0xAE) && (stepping == 0xA))); } } + + if (strcmp((const char*)manufacturer, "AuthenticAMD") == 0) { + set_flag(AMD_RYZEN_FAMILY, processor_info.family == 0x17); + } } static std::atomic codeOffset; @@ -303,8 +311,20 @@ namespace randomx { void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg) { generateProgramPrologue(prog, pcfg); - memcpy(code + codePos, RandomX_CurrentConfig.codeReadDatasetTweaked, readDatasetSize); - codePos += readDatasetSize; + + uint8_t* p; + uint32_t n; + if (check_flag(AMD_RYZEN_FAMILY)) { + p = RandomX_CurrentConfig.codeReadDatasetRyzenTweaked; + n = RandomX_CurrentConfig.codeReadDatasetRyzenTweakedSize; + } + else { + p = RandomX_CurrentConfig.codeReadDatasetTweaked; + n = RandomX_CurrentConfig.codeReadDatasetTweakedSize; + } + memcpy(code + codePos, p, n); + codePos += n; + generateProgramEpilogue(prog, pcfg); } @@ -396,7 +416,7 @@ namespace randomx { memcpy(code + codePos, codeLoopStore, loopStoreSize); codePos += loopStoreSize; - if (BranchesWithin32B) { + if (check_flag(BRANCHES_WITHIN_32B)) { const uint32_t branch_begin = static_cast(codePos); const uint32_t branch_end = static_cast(branch_begin + 9); @@ -989,6 +1009,8 @@ namespace randomx { codePos = pos; } + static const uint8_t AND_OR_MOV_LDMXCSR_RYZEN[] = { 0x25, 0x00, 0x60, 0x00, 0x00, 0x0D, 0xC0, 0x9F, 0x00, 0x00, 0x3B, 0x44, 0x24, 0xFC, 0x74, 0x09, 0x89, 0x44, 0x24, 0xFC, 0x0F, 0xAE, 0x54, 0x24, 0xFC }; + void JitCompilerX86::h_CFROUND(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; @@ -1000,7 +1022,13 @@ namespace randomx { emit(ROL_RAX, p, pos); emitByte(rotate, p, pos); } - emit(AND_OR_MOV_LDMXCSR, p, pos); + + if (check_flag(AMD_RYZEN_FAMILY)) { + emit(AND_OR_MOV_LDMXCSR_RYZEN, p, pos); + } + else { + emit(AND_OR_MOV_LDMXCSR, p, pos); + } codePos = pos; } @@ -1012,7 +1040,7 @@ namespace randomx { const int reg = instr.dst; int32_t jmp_offset = registerUsage[reg] - (pos + 16); - if (BranchesWithin32B) { + if (check_flag(BRANCHES_WITHIN_32B)) { const uint32_t branch_begin = static_cast(pos + 7); const uint32_t branch_end = static_cast(branch_begin + ((jmp_offset >= -128) ? 9 : 13)); diff --git a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.hpp b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.hpp index f1864018a..b47ff6ec5 100644 --- a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.hpp +++ b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.hpp @@ -31,6 +31,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include "crypto/randomx/common.hpp" namespace randomx { @@ -71,7 +72,23 @@ namespace randomx { uint8_t* code; int32_t codePos; - static bool BranchesWithin32B; + static std::atomic flags_set; + static constexpr uint64_t BRANCHES_WITHIN_32B = 1; + static constexpr uint64_t AMD_RYZEN_FAMILY = 2; + static uint64_t flags; + + static inline bool check_flag(uint64_t f) + { + return (flags & f) != 0; + } + + static inline void set_flag(uint64_t f, bool v) + { + if(v) + flags |= f; + else + flags &= ~f; + } static void applyTweaks(); void generateProgramPrologue(Program&, ProgramConfiguration&); diff --git a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.S b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.S index c20cd7433..50019b7e5 100644 --- a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.S +++ b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.S @@ -45,6 +45,7 @@ .global DECL(randomx_program_loop_load) .global DECL(randomx_program_start) .global DECL(randomx_program_read_dataset) +.global DECL(randomx_program_read_dataset_ryzen) .global DECL(randomx_program_read_dataset_sshash_init) .global DECL(randomx_program_read_dataset_sshash_fin) .global DECL(randomx_program_loop_store) @@ -92,6 +93,7 @@ DECL(randomx_program_prologue_first_load): and eax, RANDOMX_SCRATCHPAD_MASK ror rdx, 32 and edx, RANDOMX_SCRATCHPAD_MASK + stmxcsr dword ptr [rsp-20] jmp DECL(randomx_program_loop_begin) .balign 64 @@ -110,6 +112,9 @@ DECL(randomx_program_start): DECL(randomx_program_read_dataset): #include "asm/program_read_dataset.inc" +DECL(randomx_program_read_dataset_ryzen): + #include "asm/program_read_dataset_ryzen.inc" + DECL(randomx_program_read_dataset_sshash_init): #include "asm/program_read_dataset_sshash_init.inc" diff --git a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.asm b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.asm index 73fa503ad..189c464c5 100644 --- a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.asm +++ b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.asm @@ -36,6 +36,7 @@ PUBLIC randomx_program_loop_begin PUBLIC randomx_program_loop_load PUBLIC randomx_program_start PUBLIC randomx_program_read_dataset +PUBLIC randomx_program_read_dataset_ryzen PUBLIC randomx_program_read_dataset_sshash_init PUBLIC randomx_program_read_dataset_sshash_fin PUBLIC randomx_dataset_init @@ -80,6 +81,7 @@ randomx_program_prologue_first_load PROC and eax, RANDOMX_SCRATCHPAD_MASK ror rdx, 32 and edx, RANDOMX_SCRATCHPAD_MASK + stmxcsr dword ptr [rsp-20] jmp randomx_program_loop_begin randomx_program_prologue_first_load ENDP @@ -103,6 +105,10 @@ randomx_program_read_dataset PROC include asm/program_read_dataset.inc randomx_program_read_dataset ENDP +randomx_program_read_dataset_ryzen PROC + include asm/program_read_dataset_ryzen.inc +randomx_program_read_dataset_ryzen ENDP + randomx_program_read_dataset_sshash_init PROC include asm/program_read_dataset_sshash_init.inc randomx_program_read_dataset_sshash_init ENDP @@ -220,4 +226,4 @@ _RANDOMX_JITX86_STATIC ENDS ENDIF -END \ No newline at end of file +END diff --git a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.hpp b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.hpp index 0a62c986e..b0a7c5acb 100644 --- a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.hpp +++ b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.hpp @@ -37,6 +37,7 @@ extern "C" { void randomx_program_loop_load(); void randomx_program_start(); void randomx_program_read_dataset(); + void randomx_program_read_dataset_ryzen(); void randomx_program_read_dataset_sshash_init(); void randomx_program_read_dataset_sshash_fin(); void randomx_program_loop_store(); diff --git a/xmrstak/backend/cpu/crypto/randomx/randomx.cpp b/xmrstak/backend/cpu/crypto/randomx/randomx.cpp index 2937459c1..1c6b048d2 100644 --- a/xmrstak/backend/cpu/crypto/randomx/randomx.cpp +++ b/xmrstak/backend/cpu/crypto/randomx/randomx.cpp @@ -152,8 +152,15 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase() } { const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset; - const uint8_t* b = (const uint8_t*)&randomx_program_read_dataset_sshash_init; + const uint8_t* b = (const uint8_t*)&randomx_program_read_dataset_ryzen; memcpy(codeReadDatasetTweaked, a, b - a); + codeReadDatasetTweakedSize = b - a; + } + { + const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset_ryzen; + const uint8_t* b = (const uint8_t*)&randomx_program_read_dataset_sshash_init; + memcpy(codeReadDatasetRyzenTweaked, a, b - a); + codeReadDatasetRyzenTweakedSize = b - a; } { const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset_sshash_init; diff --git a/xmrstak/backend/cpu/crypto/randomx/randomx.h b/xmrstak/backend/cpu/crypto/randomx/randomx.h index 6fece9a4f..4dbecaef7 100644 --- a/xmrstak/backend/cpu/crypto/randomx/randomx.h +++ b/xmrstak/backend/cpu/crypto/randomx/randomx.h @@ -118,7 +118,10 @@ struct RandomX_ConfigurationBase rx_vec_i128 fillAes4Rx4_Key[8]; uint8_t codeShhPrefetchTweaked[20]; - uint8_t codeReadDatasetTweaked[64]; + uint8_t codeReadDatasetTweaked[72]; + uint32_t codeReadDatasetTweakedSize; + uint8_t codeReadDatasetRyzenTweaked[72]; + uint32_t codeReadDatasetRyzenTweakedSize; uint8_t codeReadDatasetLightSshInitTweaked[68]; uint8_t codePrefetchScratchpadTweaked[32];