Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[RX] Implement Ryzen speedups #2644

Merged
merged 1 commit into from
Dec 29, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
;# Dataset read step, Ryzen variant.
;# rbp packs two 32-bit values: "ma" in the upper half and "mx" in the lower half.
;# The 64 bytes at [rdi + (ma & RANDOMX_DATASET_BASE_MASK)] are XORed into r8-r15,
;# and the line at [rdi + (mx & RANDOMX_DATASET_BASE_MASK)] is prefetched.
;# NOTE(review): rdi is presumably the dataset base pointer -- confirm against the prologue.
mov rcx, rbp ;# ecx = ma
shr rcx, 32
and ecx, RANDOMX_DATASET_BASE_MASK
xor rbp, rax ;# modify "mx"
mov rax, qword ptr [rdi+rcx] ;# load the first qword now; it is consumed by "xor r8, rax" below
mov edx, ebp ;# edx = mx
and edx, RANDOMX_DATASET_BASE_MASK
prefetchnta byte ptr [rdi+rdx]
ror rbp, 32 ;# swap "ma" and "mx"
xor r8, rax
;# remaining seven qwords of the 64-byte item are XORed straight from memory
xor r9, qword ptr [rdi+rcx+8]
xor r10, qword ptr [rdi+rcx+16]
xor r11, qword ptr [rdi+rcx+24]
xor r12, qword ptr [rdi+rcx+32]
xor r13, qword ptr [rdi+rcx+40]
xor r14, qword ptr [rdi+rcx+48]
xor r15, qword ptr [rdi+rcx+56]

76 changes: 52 additions & 24 deletions xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,8 +224,6 @@ namespace randomx {
{0x0F, 0x1F, 0x44, 0x00, 0x00, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
};

bool JitCompilerX86::BranchesWithin32B = false;

// Size of the code emitted so far, measured from prologueSize.
// Reports 0 while codePos has not yet reached prologueSize.
size_t JitCompilerX86::getCodeSize() {
if (codePos < prologueSize) {
return 0;
}
return codePos - prologueSize;
}
Expand All @@ -241,8 +239,14 @@ namespace randomx {
# endif
}

std::atomic<uint64_t> JitCompilerX86::flags_set(0);
uint64_t JitCompilerX86::flags = 0;
// CPU-specific tweaks
void JitCompilerX86::applyTweaks() {

if(flags_set.fetch_add(1) != 0)
return;

int32_t info[4];
cpuid(0, info);

Expand All @@ -252,38 +256,42 @@ namespace randomx {
manufacturer[2] = info[2];
manufacturer[3] = 0;

if (strcmp((const char*)manufacturer, "GenuineIntel") == 0) {
struct
{
unsigned int stepping : 4;
unsigned int model : 4;
unsigned int family : 4;
unsigned int processor_type : 2;
unsigned int reserved1 : 2;
unsigned int ext_model : 4;
unsigned int ext_family : 8;
unsigned int reserved2 : 4;
} processor_info;

cpuid(1, info);
memcpy(&processor_info, info, sizeof(processor_info));
struct
{
unsigned int stepping : 4;
unsigned int model : 4;
unsigned int family : 4;
unsigned int processor_type : 2;
unsigned int reserved1 : 2;
unsigned int ext_model : 4;
unsigned int ext_family : 8;
unsigned int reserved2 : 4;
} processor_info;

cpuid(1, info);
memcpy(&processor_info, info, sizeof(processor_info));

if (strcmp((const char*)manufacturer, "GenuineIntel") == 0) {
// Intel JCC erratum mitigation
if (processor_info.family == 6) {
const uint32_t model = processor_info.model | (processor_info.ext_model << 4);
const uint32_t stepping = processor_info.stepping;

// Affected CPU models and stepping numbers are taken from https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
BranchesWithin32B =
set_flag(BRANCHES_WITHIN_32B,
((model == 0x4E) && (stepping == 0x3)) ||
((model == 0x55) && (stepping == 0x4)) ||
((model == 0x5E) && (stepping == 0x3)) ||
((model == 0x8E) && (stepping >= 0x9) && (stepping <= 0xC)) ||
((model == 0x9E) && (stepping >= 0x9) && (stepping <= 0xD)) ||
((model == 0xA6) && (stepping == 0x0)) ||
((model == 0xAE) && (stepping == 0xA));
((model == 0xAE) && (stepping == 0xA)));
}
}

// AMD family 17h (Ryzen) detection.
// CPUID leaf 1 exposes only a 4-bit base family, so the bitfield can never hold 0x17.
// When the base family is 0xF the effective ("display") family is base + extended family;
// family 17h is encoded as base 0xF + extended 0x08.
if (strcmp((const char*)manufacturer, "AuthenticAMD") == 0) {
const uint32_t base_family = processor_info.family;
const uint32_t display_family = (base_family == 0xF) ? (base_family + processor_info.ext_family) : base_family;
set_flag(AMD_RYZEN_FAMILY, display_family == 0x17);
}
}

static std::atomic<size_t> codeOffset;
Expand All @@ -303,8 +311,20 @@ namespace randomx {

void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg) {
generateProgramPrologue(prog, pcfg);
memcpy(code + codePos, RandomX_CurrentConfig.codeReadDatasetTweaked, readDatasetSize);
codePos += readDatasetSize;

uint8_t* p;
uint32_t n;
if (check_flag(AMD_RYZEN_FAMILY)) {
p = RandomX_CurrentConfig.codeReadDatasetRyzenTweaked;
n = RandomX_CurrentConfig.codeReadDatasetRyzenTweakedSize;
}
else {
p = RandomX_CurrentConfig.codeReadDatasetTweaked;
n = RandomX_CurrentConfig.codeReadDatasetTweakedSize;
}
memcpy(code + codePos, p, n);
codePos += n;

generateProgramEpilogue(prog, pcfg);
}

Expand Down Expand Up @@ -396,7 +416,7 @@ namespace randomx {
memcpy(code + codePos, codeLoopStore, loopStoreSize);
codePos += loopStoreSize;

if (BranchesWithin32B) {
if (check_flag(BRANCHES_WITHIN_32B)) {
const uint32_t branch_begin = static_cast<uint32_t>(codePos);
const uint32_t branch_end = static_cast<uint32_t>(branch_begin + 9);

Expand Down Expand Up @@ -989,6 +1009,8 @@ namespace randomx {
codePos = pos;
}

// Machine code emitted by h_CFROUND when the AMD_RYZEN_FAMILY flag is set.
// Decoded x86-64 bytes:
//   25 00 60 00 00     and eax, 0x6000
//   0D C0 9F 00 00     or  eax, 0x9FC0
//   3B 44 24 FC        cmp eax, dword ptr [rsp-4]
//   74 09              je  +9                      ; skips the store and the ldmxcsr
//   89 44 24 FC        mov dword ptr [rsp-4], eax
//   0F AE 54 24 FC     ldmxcsr dword ptr [rsp-4]
// MXCSR is therefore reloaded only when the computed value differs from the dword at [rsp-4].
static const uint8_t AND_OR_MOV_LDMXCSR_RYZEN[] = { 0x25, 0x00, 0x60, 0x00, 0x00, 0x0D, 0xC0, 0x9F, 0x00, 0x00, 0x3B, 0x44, 0x24, 0xFC, 0x74, 0x09, 0x89, 0x44, 0x24, 0xFC, 0x0F, 0xAE, 0x54, 0x24, 0xFC };

void JitCompilerX86::h_CFROUND(const Instruction& instr) {
uint8_t* const p = code;
int pos = codePos;
Expand All @@ -1000,7 +1022,13 @@ namespace randomx {
emit(ROL_RAX, p, pos);
emitByte(rotate, p, pos);
}
emit(AND_OR_MOV_LDMXCSR, p, pos);

if (check_flag(AMD_RYZEN_FAMILY)) {
emit(AND_OR_MOV_LDMXCSR_RYZEN, p, pos);
}
else {
emit(AND_OR_MOV_LDMXCSR, p, pos);
}

codePos = pos;
}
Expand All @@ -1012,7 +1040,7 @@ namespace randomx {
const int reg = instr.dst;
int32_t jmp_offset = registerUsage[reg] - (pos + 16);

if (BranchesWithin32B) {
if (check_flag(BRANCHES_WITHIN_32B)) {
const uint32_t branch_begin = static_cast<uint32_t>(pos + 7);
const uint32_t branch_end = static_cast<uint32_t>(branch_begin + ((jmp_offset >= -128) ? 9 : 13));

Expand Down
19 changes: 18 additions & 1 deletion xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <cstdint>
#include <cstring>
#include <vector>
#include <atomic>
#include "crypto/randomx/common.hpp"

namespace randomx {
Expand Down Expand Up @@ -71,7 +72,23 @@ namespace randomx {
uint8_t* code;
int32_t codePos;

static bool BranchesWithin32B;
static std::atomic<uint64_t> flags_set;
static constexpr uint64_t BRANCHES_WITHIN_32B = 1;
static constexpr uint64_t AMD_RYZEN_FAMILY = 2;
static uint64_t flags;

// Returns true when at least one bit of f is set in flags.
static inline bool check_flag(uint64_t f)
{
const uint64_t matched = flags & f;
return matched != 0;
}

// Sets the bits of f in flags when v is true, clears them when v is false.
static inline void set_flag(uint64_t f, bool v)
{
flags = v ? (flags | f) : (flags & ~f);
}

static void applyTweaks();
void generateProgramPrologue(Program&, ProgramConfiguration&);
Expand Down
5 changes: 5 additions & 0 deletions xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.S
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
.global DECL(randomx_program_loop_load)
.global DECL(randomx_program_start)
.global DECL(randomx_program_read_dataset)
.global DECL(randomx_program_read_dataset_ryzen)
.global DECL(randomx_program_read_dataset_sshash_init)
.global DECL(randomx_program_read_dataset_sshash_fin)
.global DECL(randomx_program_loop_store)
Expand Down Expand Up @@ -92,6 +93,7 @@ DECL(randomx_program_prologue_first_load):
and eax, RANDOMX_SCRATCHPAD_MASK
ror rdx, 32
and edx, RANDOMX_SCRATCHPAD_MASK
stmxcsr dword ptr [rsp-20]
jmp DECL(randomx_program_loop_begin)

.balign 64
Expand All @@ -110,6 +112,9 @@ DECL(randomx_program_start):
DECL(randomx_program_read_dataset):
#include "asm/program_read_dataset.inc"

DECL(randomx_program_read_dataset_ryzen):
#include "asm/program_read_dataset_ryzen.inc"

DECL(randomx_program_read_dataset_sshash_init):
#include "asm/program_read_dataset_sshash_init.inc"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ PUBLIC randomx_program_loop_begin
PUBLIC randomx_program_loop_load
PUBLIC randomx_program_start
PUBLIC randomx_program_read_dataset
PUBLIC randomx_program_read_dataset_ryzen
PUBLIC randomx_program_read_dataset_sshash_init
PUBLIC randomx_program_read_dataset_sshash_fin
PUBLIC randomx_dataset_init
Expand Down Expand Up @@ -80,6 +81,7 @@ randomx_program_prologue_first_load PROC
and eax, RANDOMX_SCRATCHPAD_MASK
ror rdx, 32
and edx, RANDOMX_SCRATCHPAD_MASK
stmxcsr dword ptr [rsp-20]
jmp randomx_program_loop_begin
randomx_program_prologue_first_load ENDP

Expand All @@ -103,6 +105,10 @@ randomx_program_read_dataset PROC
include asm/program_read_dataset.inc
randomx_program_read_dataset ENDP

randomx_program_read_dataset_ryzen PROC
include asm/program_read_dataset_ryzen.inc
randomx_program_read_dataset_ryzen ENDP

randomx_program_read_dataset_sshash_init PROC
include asm/program_read_dataset_sshash_init.inc
randomx_program_read_dataset_sshash_init ENDP
Expand Down Expand Up @@ -220,4 +226,4 @@ _RANDOMX_JITX86_STATIC ENDS

ENDIF

END
END
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ extern "C" {
void randomx_program_loop_load();
void randomx_program_start();
void randomx_program_read_dataset();
void randomx_program_read_dataset_ryzen();
void randomx_program_read_dataset_sshash_init();
void randomx_program_read_dataset_sshash_fin();
void randomx_program_loop_store();
Expand Down
9 changes: 8 additions & 1 deletion xmrstak/backend/cpu/crypto/randomx/randomx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,15 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase()
}
{
const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset;
const uint8_t* b = (const uint8_t*)&randomx_program_read_dataset_sshash_init;
const uint8_t* b = (const uint8_t*)&randomx_program_read_dataset_ryzen;
memcpy(codeReadDatasetTweaked, a, b - a);
codeReadDatasetTweakedSize = b - a;
}
{
const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset_ryzen;
const uint8_t* b = (const uint8_t*)&randomx_program_read_dataset_sshash_init;
memcpy(codeReadDatasetRyzenTweaked, a, b - a);
codeReadDatasetRyzenTweakedSize = b - a;
}
{
const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset_sshash_init;
Expand Down
5 changes: 4 additions & 1 deletion xmrstak/backend/cpu/crypto/randomx/randomx.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,10 @@ struct RandomX_ConfigurationBase
rx_vec_i128 fillAes4Rx4_Key[8];

uint8_t codeShhPrefetchTweaked[20];
uint8_t codeReadDatasetTweaked[64];
uint8_t codeReadDatasetTweaked[72];
uint32_t codeReadDatasetTweakedSize;
uint8_t codeReadDatasetRyzenTweaked[72];
uint32_t codeReadDatasetRyzenTweakedSize;
uint8_t codeReadDatasetLightSshInitTweaked[68];
uint8_t codePrefetchScratchpadTweaked[32];

Expand Down