diff --git a/ares/n64/cpu/context.cpp b/ares/n64/cpu/context.cpp
index 770cfd76a5..6e07d88f35 100644
--- a/ares/n64/cpu/context.cpp
+++ b/ares/n64/cpu/context.cpp
@@ -18,6 +18,9 @@ auto CPU::Context::setMode() -> void {
     break;
   }
 
+  jit.update(*this, self);
+  jitBits = jit.toBits();
+
   if(bits == 32) {
     physMask = 0x1fff'ffff;
     segment[0] = Segment::Mapped;
@@ -63,3 +66,26 @@ auto CPU::Context::setMode() -> void {
     }
   }
 }
+
+auto CPU::Context::JIT::update(const Context& ctx, const CPU& cpu) -> void {
+  singleInstruction = GDB::server.hasBreakpoints();
+  endian = Context::Endian(ctx.endian);
+  mode = Context::Mode(ctx.mode);
+  cop1Enabled = cpu.scc.status.enable.coprocessor1 > 0;
+  floatingPointMode = cpu.scc.status.floatingPointMode > 0;
+  is64bit = ctx.bits == 64;
+}
+
+//packs the JIT state into the upper part of a pool key; bits 0-5 stay clear because
+//Recompiler::computePoolKey() fills them with the block's word index (address >> 2 & 0x3f).
+//layout: 6 = singleInstruction, 7 = endian, 8-9 = mode, 10 = cop1Enabled,
+//11 = floatingPointMode, 12 = is64bit. fields must not overlap or distinct states share a tag.
+auto CPU::Context::JIT::toBits() const -> u32 {
+  u32 bits = singleInstruction ? 1 << 6 : 0;
+  bits |= endian ? 1 << 7 : 0;
+  bits |= (mode & 0x03) << 8;  //two-bit field at bits 8-9; a shift of 9 spilled into cop1Enabled (bit 10)
+  bits |= cop1Enabled ? 1 << 10 : 0;
+  bits |= floatingPointMode ? 1 << 11 : 0;
+  bits |= is64bit ? 1 << 12 : 0;
+  return bits;
+}
diff --git a/ares/n64/cpu/cpu.cpp b/ares/n64/cpu/cpu.cpp
index 29f03ce916..46fab32ec7 100644
--- a/ares/n64/cpu/cpu.cpp
+++ b/ares/n64/cpu/cpu.cpp
@@ -113,7 +113,7 @@ auto CPU::instruction() -> void {
 
   if(Accuracy::CPU::Recompiler && recompiler.enabled && access.cache) {
     if(vaddrAlignedError(access.vaddr, false)) return;
-    auto block = recompiler.block(ipu.pc, access.paddr, GDB::server.hasBreakpoints());
+    auto block = recompiler.block(ipu.pc, access.paddr, context);
     block->execute(*this);
   } else {
     auto data = fetch(access);
diff --git a/ares/n64/cpu/cpu.hpp b/ares/n64/cpu/cpu.hpp
index bddb7222a1..a42b12ea08 100644
--- a/ares/n64/cpu/cpu.hpp
+++ b/ares/n64/cpu/cpu.hpp
@@ -92,6 +92,18 @@ struct CPU : Thread {
     enum Mode : u32 { Kernel, Supervisor, User };
     enum Segment : u32 { Unused, Mapped, Cached, Direct, Cached32, Direct32, Kernel64, Supervisor64, User64 };
 
+    struct JIT {
+      bool singleInstruction;
+      Endian endian;
+      Mode mode;
+      bool cop1Enabled;
+      bool floatingPointMode;
+      bool is64bit;
+
+      auto update(const Context& ctx, const CPU& cpu) -> void;
+      auto toBits() const -> u32;
+    };
+
     auto littleEndian() const -> bool { return endian == Endian::Little; }
     auto bigEndian() const -> bool { return endian == Endian::Big; }
 
@@ -106,6 +118,8 @@ struct CPU : Thread {
     u32 mode;
     u32 bits;
     u32 segment[8];  //512_MiB chunks
+    u32 jitBits;
+    Context::JIT jit;
   } context{*this};
 
   //icache.cpp
@@ -863,7 +877,11 @@ struct CPU : Thread {
     };
 
     struct Pool {
-      Block* blocks[1 << 6];
+      struct Row {
+        Block* block;
+        u32 tag;
+      };
+      Row rows[1 << 6];
     };
 
     auto reset() -> void {
@@ -899,12 +917,16 @@ struct CPU : Thread {
     }
 
     auto pool(u32 address) -> Pool*;
-    auto block(u64 vaddr, u32 address, bool singleInstruction = false) -> Block*;
+    auto computePoolKey(u32 address, u32 ctxHash) -> u32;
+    auto computePoolRow(u32 key) -> u32;
+    auto block(u64 vaddr, u32 address, const Context& ctx) -> Block*;
 
-    auto emit(u64 vaddr, u32 address, bool singleInstruction = false) -> Block*;
+    auto emit(u64 vaddr, u32 address, Context::JIT ctx) -> Block*;
+    auto emitOverflowCheck(reg temp) -> sljit_jump*;
     auto emitZeroClear(u32 n) -> void;
-    auto emitEXECUTE(u32 instruction) -> bool;
-    auto emitSPECIAL(u32 instruction) -> bool;
+    auto checkDualAllowed(const Context::JIT& ctx) -> bool;
+    auto emitEXECUTE(u32 instruction, Context::JIT ctx) -> bool;
+    auto emitSPECIAL(u32 instruction, Context::JIT ctx) -> bool;
     auto emitREGIMM(u32 instruction) -> bool;
     auto emitSCC(u32 instruction) -> bool;
     auto emitFPU(u32 instruction) -> bool;
diff --git a/ares/n64/cpu/recompiler.cpp b/ares/n64/cpu/recompiler.cpp
index 68a6962243..f08b4c6187 100644
--- a/ares/n64/cpu/recompiler.cpp
+++ b/ares/n64/cpu/recompiler.cpp
@@ -9,10 +9,34 @@ auto CPU::Recompiler::pool(u32 address) -> Pool* {
   return pool;
 }
 
-auto CPU::Recompiler::block(u64 vaddr, u32 address, bool singleInstruction) -> Block* {
-  if(auto block = pool(address)->blocks[address >> 2 & 0x3f]) return block;
-  auto block = emit(vaddr, address, singleInstruction);
-  pool(address)->blocks[address >> 2 & 0x3f] = block;
+auto CPU::Recompiler::computePoolKey(u32 address, u32 jitBits) -> u32 {
+  return (address >> 2 & 0x3f) | (jitBits & ~0x3f);
+}
+
+auto CPU::Recompiler::computePoolRow(u32 key) -> u32 {
+  // Jon Maiga's 'xmx' mixer, see https://jonkagstrom.com/bit-mixer-construction/
+  u64 x = key;
+  x ^= x >> 23;
+  x *= 0xff51afd7ed558ccdull;
+  x ^= x >> 23;
+  u32 row = x & 0x3f;
+  assert(row < sizeof(Pool::rows)/sizeof(Pool::rows[0]));
+  return row;
+}
+
+auto CPU::Recompiler::block(u64 vaddr, u32 address, const Context& ctx) -> Block* {
+  u32 key = computePoolKey(address, ctx.jitBits);
+  u32 row = computePoolRow(key);
+
+  if (pool(address)->rows[row].tag == key) {
+    if (auto block = pool(address)->rows[row].block) {
+      return block;
+    }
+  }
+
+  memory::jitprotect(false);
+  auto block = emit(vaddr, address, ctx.jit);
+  pool(address)->rows[row] = {.block = block, .tag = key};
   memory::jitprotect(true);
   return block;
 }
@@ -21,7 +45,7 @@ auto CPU::Recompiler::block(u64 vaddr, u32 address, bool singleInstruction) -> B
 #define IpuReg(r) sreg(1), offsetof(IPU, r) - IpuBase
 #define PipelineReg(x) mem(sreg(0), offsetof(CPU, pipeline) + offsetof(Pipeline, x))
 
-auto CPU::Recompiler::emit(u64 vaddr, u32 address, bool singleInstruction) -> Block* {
+auto CPU::Recompiler::emit(u64 vaddr, u32 address, Context::JIT ctx) -> Block* {
   if(unlikely(allocator.available() < 1_MiB)) {
     print("CPU allocator flush\n");
     allocator.release();
@@ -46,7 +70,7 @@ auto CPU::Recompiler::emit(u64 vaddr, u32 address, bool singleInstruction) -> Bl
       mov32(reg(2), imm(instruction));
       call(&CPU::instructionPrologue);
     }
-    bool branched = emitEXECUTE(instruction);
+    bool branched = emitEXECUTE(instruction, ctx);
     if(unlikely(instruction == branchToSelf || instruction == jumpToSelf)) {
       //accelerate idle loops
       mov32(reg(1), imm(64 * 2));
@@ -60,7 +84,7 @@ auto CPU::Recompiler::emit(u64 vaddr, u32 address, bool singleInstruction) -> Bl
     vaddr += 4;
     address += 4;
     jumpToSelf += 4;
-    if(hasBranched || (address & 0xfc) == 0 || singleInstruction) break;  //block boundary
+    if(hasBranched || (address & 0xfc) == 0 || ctx.singleInstruction) break;  //block boundary
     hasBranched = branched;
     jumpEpilog(flag_nz);
   }
@@ -103,12 +127,31 @@ auto CPU::Recompiler::emitZeroClear(u32 n) -> void {
   if(n == 0) mov64(mem(IpuReg(r[0])), imm(0));
 }
 
-auto CPU::Recompiler::emitEXECUTE(u32 instruction) -> bool {
+auto CPU::Recompiler::emitOverflowCheck(reg temp) -> sljit_jump* {
+  // If overflow flag set: throw an exception, skip the instruction via the 'end' label.
+  mov32_f(temp, flag_o);
+  auto didntOverflow = cmp32_jump(temp, imm(0), flag_eq);
+  call(&CPU::Exception::arithmeticOverflow, &cpu.exception);
+  auto end = jump();
+  setLabel(didntOverflow);
+  return end;
+}
+
+auto CPU::Recompiler::checkDualAllowed(const Context::JIT& ctx) -> bool {
+  if (ctx.mode != Context::Mode::Kernel && !ctx.is64bit) {
+    call(&CPU::Exception::reservedInstruction, &self.exception);
+    return false;
+  }
+
+  return true;
+}
+
+auto CPU::Recompiler::emitEXECUTE(u32 instruction, Context::JIT ctx) -> bool {
   switch(instruction >> 26) {
 
   //SPECIAL
   case 0x00: {
-    return emitSPECIAL(instruction);
+    return emitSPECIAL(instruction, ctx);
   }
 
   //REGIMM
@@ -284,21 +327,19 @@ auto CPU::Recompiler::emitEXECUTE(u32 instruction) -> bool {
 
   //DADDI Rt,Rs,i16
   case 0x18: {
-    lea(reg(1), Rt);
-    lea(reg(2), Rs);
-    mov32(reg(3), imm(i16));
-    call(&CPU::DADDI);
-    emitZeroClear(Rtn);
+    if (!checkDualAllowed(ctx)) return 1;
+    add64(reg(0), mem(Rs), imm(i16), set_o);
+    auto skip = emitOverflowCheck(reg(2));
+    if(Rtn > 0) mov64(mem(Rt), reg(0));
+    setLabel(skip);
     return 0;
   }
 
   //DADDIU Rt,Rs,i16
   case 0x19: {
-    lea(reg(1), Rt);
-    lea(reg(2), Rs);
-    mov32(reg(3), imm(i16));
-    call(&CPU::DADDIU);
-    emitZeroClear(Rtn);
+    if (!checkDualAllowed(ctx)) return 1;
+    add64(reg(0), mem(Rs), imm(i16), set_o);
+    if(Rtn > 0) mov64(mem(Rt), reg(0));
     return 0;
   }
 
@@ -616,7 +657,7 @@ auto CPU::Recompiler::emitEXECUTE(u32 instruction) -> bool {
   return 0;
 }
 
-auto CPU::Recompiler::emitSPECIAL(u32 instruction) -> bool {
+auto CPU::Recompiler::emitSPECIAL(u32 instruction, Context::JIT ctx) -> bool {
   switch(instruction & 0x3f) {
 
   //SLL Rd,Rt,Sa
@@ -760,11 +801,10 @@ auto CPU::Recompiler::emitSPECIAL(u32 instruction) -> bool {
 
   //DSLLV Rd,Rt,Rs
   case 0x14: {
-    lea(reg(1), Rd);
-    lea(reg(2), Rt);
-    lea(reg(3), Rs);
-    call(&CPU::DSLLV);
-    emitZeroClear(Rdn);
+    if (!checkDualAllowed(ctx)) return 1;
+    if (Rdn == 0) return 0;
+    and64(reg(0), mem(Rs32), imm(63));
+    shl64(mem(Rd), mem(Rt), reg(0));
     return 0;
   }
 
@@ -776,21 +816,19 @@ auto CPU::Recompiler::emitSPECIAL(u32 instruction) -> bool {
 
   //DSRLV Rd,Rt,Rs
   case 0x16: {
-    lea(reg(1), Rd);
-    lea(reg(2), Rt);
-    lea(reg(3), Rs);
-    call(&CPU::DSRLV);
-    emitZeroClear(Rdn);
+    if (!checkDualAllowed(ctx)) return 1;
+    if (Rdn == 0) return 0;
+    and64(reg(0), mem(Rs32), imm(63));
+    lshr64(mem(Rd), mem(Rt), reg(0));
     return 0;
   }
 
   //DSRAV Rd,Rt,Rs
   case 0x17: {
-    lea(reg(1), Rd);
-    lea(reg(2), Rt);
-    lea(reg(3), Rs);
-    call(&CPU::DSRAV);
-    emitZeroClear(Rdn);
+    if (!checkDualAllowed(ctx)) return 1;
+    if (Rdn == 0) return 0;
+    and64(reg(0), mem(Rs32), imm(63));
+    ashr64(mem(Rd), mem(Rt), reg(0));
     return 0;
   }
 
@@ -950,41 +988,42 @@ auto CPU::Recompiler::emitSPECIAL(u32 instruction) -> bool {
 
   //DADD Rd,Rs,Rt
   case 0x2c: {
-    lea(reg(1), Rd);
-    lea(reg(2), Rs);
-    lea(reg(3), Rt);
-    call(&CPU::DADD);
-    emitZeroClear(Rdn);
+    if (!checkDualAllowed(ctx)) return 1;
+    add64(reg(0), mem(Rs), mem(Rt), set_o);
+    auto skip = emitOverflowCheck(reg(2));
+    if(Rdn > 0) mov64(mem(Rd), reg(0));
+    setLabel(skip);
     return 0;
   }
 
   //DADDU Rd,Rs,Rt
   case 0x2d: {
-    lea(reg(1), Rd);
-    lea(reg(2), Rs);
-    lea(reg(3), Rt);
-    call(&CPU::DADDU);
-    emitZeroClear(Rdn);
+    if (!checkDualAllowed(ctx)) {
+      return 1;
+    }
+
+    if(Rdn == 0) return 0;
+
+    add64(reg(0), mem(Rs), mem(Rt));
+    mov64(mem(Rd), reg(0));
     return 0;
   }
 
   //DSUB Rd,Rs,Rt
   case 0x2e: {
-    lea(reg(1), Rd);
-    lea(reg(2), Rs);
-    lea(reg(3), Rt);
-    call(&CPU::DSUB);
-    emitZeroClear(Rdn);
+    if (!checkDualAllowed(ctx)) return 1;
+    sub64(reg(0), mem(Rs), mem(Rt), set_o);
+    auto skip = emitOverflowCheck(reg(2));
+    if(Rdn > 0) mov64(mem(Rd), reg(0));
+    setLabel(skip);
     return 0;
   }
 
   //DSUBU Rd,Rs,Rt
   case 0x2f: {
-    lea(reg(1), Rd);
-    lea(reg(2), Rs);
-    lea(reg(3), Rt);
-    call(&CPU::DSUBU);
-    emitZeroClear(Rdn);
+    if (!checkDualAllowed(ctx)) return 1;
+    sub64(reg(0), mem(Rs), mem(Rt), set_o);
+    if(Rdn > 0) mov64(mem(Rd), reg(0));
     return 0;
   }
 
@@ -1050,11 +1089,9 @@ auto CPU::Recompiler::emitSPECIAL(u32 instruction) -> bool {
 
   //DSLL Rd,Rt,Sa
   case 0x38: {
-    lea(reg(1), Rd);
-    lea(reg(2), Rt);
-    mov32(reg(3), imm(Sa));
-    call(&CPU::DSLL);
-    emitZeroClear(Rdn);
+    if (!checkDualAllowed(ctx)) return 1;
+    if (Rdn == 0) return 0;
+    shl64(mem(Rd), mem(Rt), imm(Sa));
     return 0;
   }
 
@@ -1076,21 +1113,17 @@ auto CPU::Recompiler::emitSPECIAL(u32 instruction) -> bool {
 
   //DSRA Rd,Rt,Sa
   case 0x3b: {
-    lea(reg(1), Rd);
-    lea(reg(2), Rt);
-    mov32(reg(3), imm(Sa));
-    call(&CPU::DSRA);
-    emitZeroClear(Rdn);
+    if (!checkDualAllowed(ctx)) return 1;
+    if (Rdn == 0) return 0;
+    ashr64(mem(Rd), mem(Rt), imm(Sa));
     return 0;
   }
 
   //DSLL32 Rd,Rt,Sa
   case 0x3c: {
-    lea(reg(1), Rd);
-    lea(reg(2), Rt);
-    mov32(reg(3), imm(Sa+32));
-    call(&CPU::DSLL);
-    emitZeroClear(Rdn);
+    if (!checkDualAllowed(ctx)) return 1;
+    if (Rdn == 0) return 0;
+    shl64(mem(Rd), mem(Rt), imm(Sa+32));
     return 0;
   }
 