ares-emulator · kannoneer · Sep 10, 2024 · Sep 10, 2024
diff --git a/ares/n64/cpu/context.cpp b/ares/n64/cpu/context.cpp
@@ -18,6 +18,9 @@ auto CPU::Context::setMode() -> void {
     break;
   }
 
+  jit.update(*this, self);
+  jitBits = jit.toBits();
+
   if(bits == 32) {
     physMask = 0x1fff'ffff;
     segment[0] = Segment::Mapped;
@@ -63,3 +66,22 @@ auto CPU::Context::setMode() -> void {
     }
   }
 }
+
+auto CPU::Context::JIT::update(const Context& ctx, const CPU& cpu) -> void {  //resamples every field from ctx and cpu
+  is64bit = (ctx.bits == 64);
+  mode = static_cast<Context::Mode>(ctx.mode);
+  endian = static_cast<Context::Endian>(ctx.endian);
+  floatingPointMode = (cpu.scc.status.floatingPointMode > 0);
+  cop1Enabled = (cpu.scc.status.enable.coprocessor1 > 0);
+  singleInstruction = GDB::server.hasBreakpoints();  //NOTE(review): sampled only when update() runs; confirm breakpoint changes trigger a refresh
+}
+
+auto CPU::Context::JIT::toBits() const -> u32 {  //packs the fields into bits 6..12 of the result; bits 0..5 stay zero
+  u32 bits = singleInstruction ? 1 << 6 : 0;  //bit 6
+  bits |= endian ? 1 << 7 : 0;  //bit 7
+  bits |= (mode & 0x03) << 8;  //bits 8..9: two-bit mode field; a shift of 9 would spill User (2) onto bit 10 and alias cop1Enabled
+  bits |= cop1Enabled ? 1 << 10 : 0;  //bit 10
+  bits |= floatingPointMode ? 1 << 11 : 0;  //bit 11
+  bits |= is64bit ? 1 << 12 : 0;  //bit 12
+  return bits;
+}
diff --git a/ares/n64/cpu/cpu.cpp b/ares/n64/cpu/cpu.cpp
@@ -113,7 +113,7 @@ auto CPU::instruction() -> void {
 
   if(Accuracy::CPU::Recompiler && recompiler.enabled && access.cache) {
     if(vaddrAlignedError<Word>(access.vaddr, false)) return;
-    auto block = recompiler.block(ipu.pc, access.paddr, GDB::server.hasBreakpoints());
+    auto block = recompiler.block(ipu.pc, access.paddr, context);
     block->execute(*this);
   } else {
     auto data = fetch(access);

diff --git a/ares/n64/cpu/cpu.hpp b/ares/n64/cpu/cpu.hpp
@@ -92,6 +92,18 @@ struct CPU : Thread {
     enum Mode : u32 { Kernel, Supervisor, User };
     enum Segment : u32 { Unused, Mapped, Cached, Direct, Cached32, Direct32, Kernel64, Supervisor64, User64 };
 
+    struct JIT {  //CPU state handed to the recompiler when emitting a block; toBits() is what ends up in the block cache key
+      bool singleInstruction;  //set from GDB::server.hasBreakpoints(); makes emit() stop after one instruction
+      Endian endian;  //copied from Context::endian
+      Mode mode;  //copied from Context::mode
+      bool cop1Enabled;  //scc.status.enable.coprocessor1 > 0
+      bool floatingPointMode;  //scc.status.floatingPointMode > 0
+      bool is64bit;  //true when Context::bits == 64
+
+      auto update(const Context& ctx, const CPU& cpu) -> void;  //refreshes every field above from ctx and cpu
+      auto toBits() const -> u32;  //packs the fields into a bitmask
+    };
+
     auto littleEndian() const -> bool { return endian == Endian::Little; }
     auto bigEndian() const -> bool { return endian == Endian::Big; }
 
@@ -106,6 +118,8 @@ struct CPU : Thread {
     u32  mode;
     u32  bits;
     u32  segment[8];  //512_MiB chunks
+    u32  jitBits;
+    Context::JIT jit;
   } context{*this};
 
   //icache.cpp
@@ -863,7 +877,11 @@ struct CPU : Thread {
     };
 
     struct Pool {
-      Block* blocks[1 << 6];
+      struct Row {  //one cache slot: a block plus the key it was stored under
+        Block* block;  //may be null; block() checks it before returning a hit
+        u32 tag;  //key from computePoolKey() that this slot was filled with
+      };
+      Row rows[1 << 6];  //64 slots, indexed by computePoolRow(key)
     };
 
     auto reset() -> void {
@@ -899,12 +917,16 @@ struct CPU : Thread {
     }
 
     auto pool(u32 address) -> Pool*;
-    auto block(u64 vaddr, u32 address, bool singleInstruction = false) -> Block*;
+    auto computePoolKey(u32 address, u32 ctxHash) -> u32;
+    auto computePoolRow(u32 key) -> u32;
+    auto block(u64 vaddr, u32 address, const Context& ctx) -> Block*;
 
-    auto emit(u64 vaddr, u32 address, bool singleInstruction = false) -> Block*;
+    auto emit(u64 vaddr, u32 address, Context::JIT ctx) -> Block*;
+    auto emitOverflowCheck(reg temp) -> sljit_jump*;
     auto emitZeroClear(u32 n) -> void;
-    auto emitEXECUTE(u32 instruction) -> bool;
-    auto emitSPECIAL(u32 instruction) -> bool;
+    auto checkDualAllowed(const Context::JIT& ctx) -> bool;
+    auto emitEXECUTE(u32 instruction, Context::JIT ctx) -> bool;
+    auto emitSPECIAL(u32 instruction, Context::JIT ctx) -> bool;
     auto emitREGIMM(u32 instruction) -> bool;
     auto emitSCC(u32 instruction) -> bool;
     auto emitFPU(u32 instruction) -> bool;

diff --git a/ares/n64/cpu/recompiler.cpp b/ares/n64/cpu/recompiler.cpp
@@ -9,10 +9,34 @@ auto CPU::Recompiler::pool(u32 address) -> Pool* {
   return pool;
 }
 
-auto CPU::Recompiler::block(u64 vaddr, u32 address, bool singleInstruction) -> Block* {
-  if(auto block = pool(address)->blocks[address >> 2 & 0x3f]) return block;
-  auto block = emit(vaddr, address, singleInstruction);
-  pool(address)->blocks[address >> 2 & 0x3f] = block;
+auto CPU::Recompiler::computePoolKey(u32 address, u32 jitBits) -> u32 {  //low 6 bits: address >> 2; remaining bits: jitBits
+  return (jitBits & ~0x3f) | ((address >> 2) & 0x3f);
+}
+
+auto CPU::Recompiler::computePoolRow(u32 key) -> u32 {
+  //xor-shift, multiply, xor-shift ('xmx' mixer by Jon Maiga, https://jonkagstrom.com/bit-mixer-construction/)
+  u64 mixed = key;
+  mixed = mixed ^ (mixed >> 23);
+  mixed = mixed * 0xff51afd7ed558ccdull;
+  mixed = mixed ^ (mixed >> 23);
+  u32 row = mixed & 0x3f;  //keep the low 6 bits as the row index
+  assert(row < sizeof(Pool::rows)/sizeof(Pool::rows[0]));
+  return row;
+}
+
+auto CPU::Recompiler::block(u64 vaddr, u32 address, const Context& ctx) -> Block* {
+  //the key combines the address with the JIT context bits; the row is a hash of that key
+  const u32 key = computePoolKey(address, ctx.jitBits);
+  const u32 row = computePoolRow(key);
+
+  //a row only hits when its tag equals this key and it holds a non-null block
+  Block* cached = nullptr;
+  if(pool(address)->rows[row].tag == key) cached = pool(address)->rows[row].block;
+  if(cached) return cached;
+
+  memory::jitprotect(false);
+  auto block = emit(vaddr, address, ctx.jit);
+  pool(address)->rows[row] = {.block = block, .tag = key};
   memory::jitprotect(true);
   return block;
 }
@@ -21,7 +45,7 @@ auto CPU::Recompiler::block(u64 vaddr, u32 address, bool singleInstruction) -> B
 #define IpuReg(r)      sreg(1), offsetof(IPU, r) - IpuBase
 #define PipelineReg(x) mem(sreg(0), offsetof(CPU, pipeline) + offsetof(Pipeline, x))
 
-auto CPU::Recompiler::emit(u64 vaddr, u32 address, bool singleInstruction) -> Block* {
+auto CPU::Recompiler::emit(u64 vaddr, u32 address, Context::JIT ctx) -> Block* {
   if(unlikely(allocator.available() < 1_MiB)) {
     print("CPU allocator flush\n");
     allocator.release();
@@ -46,7 +70,7 @@ auto CPU::Recompiler::emit(u64 vaddr, u32 address, bool singleInstruction) -> Bl
       mov32(reg(2), imm(instruction));
       call(&CPU::instructionPrologue);
     }
-    bool branched = emitEXECUTE(instruction);
+    bool branched = emitEXECUTE(instruction, ctx);
     if(unlikely(instruction == branchToSelf || instruction == jumpToSelf)) {
       //accelerate idle loops
       mov32(reg(1), imm(64 * 2));
@@ -60,7 +84,7 @@ auto CPU::Recompiler::emit(u64 vaddr, u32 address, bool singleInstruction) -> Bl
     vaddr += 4;
     address += 4;
     jumpToSelf += 4;
-    if(hasBranched || (address & 0xfc) == 0 || singleInstruction) break;  //block boundary
+    if(hasBranched || (address & 0xfc) == 0 || ctx.singleInstruction) break;  //block boundary
     hasBranched = branched;
     jumpEpilog(flag_nz);
   }
@@ -103,12 +127,31 @@ auto CPU::Recompiler::emitZeroClear(u32 n) -> void {
   if(n == 0) mov64(mem(IpuReg(r[0])), imm(0));
 }
 
-auto CPU::Recompiler::emitEXECUTE(u32 instruction) -> bool {
+auto CPU::Recompiler::emitOverflowCheck(reg temp) -> sljit_jump* {
+  //emitted code: when the overflow flag is clear, carry on; otherwise raise arithmetic overflow and jump away
+  mov32_f(temp, flag_o);
+  auto noOverflow = cmp32_jump(temp, imm(0), flag_eq);
+  call(&CPU::Exception::arithmeticOverflow, &cpu.exception);
+  auto skipInstruction = jump();  //returned unbound: the caller binds it after its register write
+  setLabel(noOverflow);
+  return skipInstruction;
+}
+
+auto CPU::Recompiler::checkDualAllowed(const Context::JIT& ctx) -> bool {
+  //allowed when the block is compiled for kernel mode or with is64bit set
+  const bool allowed = ctx.mode == Context::Mode::Kernel || ctx.is64bit;
+  if(allowed) return true;
+  //otherwise emit a call that raises a reserved instruction exception and report failure
+  call(&CPU::Exception::reservedInstruction, &self.exception);
+  return false;
+}
+
+auto CPU::Recompiler::emitEXECUTE(u32 instruction, Context::JIT ctx) -> bool {
   switch(instruction >> 26) {
 
   //SPECIAL
   case 0x00: {
-    return emitSPECIAL(instruction);
+    return emitSPECIAL(instruction, ctx);
   }
 
   //REGIMM
@@ -284,21 +327,19 @@ auto CPU::Recompiler::emitEXECUTE(u32 instruction) -> bool {
 
   //DADDI Rt,Rs,i16
   case 0x18: {
-    lea(reg(1), Rt);
-    lea(reg(2), Rs);
-    mov32(reg(3), imm(i16));
-    call(&CPU::DADDI);
-    emitZeroClear(Rtn);
+    if (!checkDualAllowed(ctx)) return 1;
+    add64(reg(0), mem(Rs), imm(i16), set_o);
+    auto skip = emitOverflowCheck(reg(2));
+    if(Rtn > 0) mov64(mem(Rt), reg(0));
+    setLabel(skip);
     return 0;
   }
 
   //DADDIU Rt,Rs,i16
   case 0x19: {
-    lea(reg(1), Rt);
-    lea(reg(2), Rs);
-    mov32(reg(3), imm(i16));
-    call(&CPU::DADDIU);
-    emitZeroClear(Rtn);
+    if (!checkDualAllowed(ctx)) return 1;
+    add64(reg(0), mem(Rs), imm(i16), set_o);
+    if(Rtn > 0) mov64(mem(Rt), reg(0));
     return 0;
   }
 
@@ -616,7 +657,7 @@ auto CPU::Recompiler::emitEXECUTE(u32 instruction) -> bool {
   return 0;
 }
 
-auto CPU::Recompiler::emitSPECIAL(u32 instruction) -> bool {
+auto CPU::Recompiler::emitSPECIAL(u32 instruction, Context::JIT ctx) -> bool {
   switch(instruction & 0x3f) {
 
   //SLL Rd,Rt,Sa
@@ -760,11 +801,10 @@ auto CPU::Recompiler::emitSPECIAL(u32 instruction) -> bool {
 
   //DSLLV Rd,Rt,Rs
   case 0x14: {
-    lea(reg(1), Rd);
-    lea(reg(2), Rt);
-    lea(reg(3), Rs);
-    call(&CPU::DSLLV);
-    emitZeroClear(Rdn);
+    if (!checkDualAllowed(ctx)) return 1;
+    if (Rdn == 0) return 0;
+    and64(reg(0), mem(Rs32), imm(63));
+    shl64(mem(Rd), mem(Rt), reg(0));
     return 0;
   }
 
@@ -776,21 +816,19 @@ auto CPU::Recompiler::emitSPECIAL(u32 instruction) -> bool {
 
   //DSRLV Rd,Rt,Rs
   case 0x16: {
-    lea(reg(1), Rd);
-    lea(reg(2), Rt);
-    lea(reg(3), Rs);
-    call(&CPU::DSRLV);
-    emitZeroClear(Rdn);
+    if (!checkDualAllowed(ctx)) return 1;
+    if (Rdn == 0) return 0;
+    and64(reg(0), mem(Rs32), imm(63));
+    lshr64(mem(Rd), mem(Rt), reg(0));
     return 0;
   }
 
   //DSRAV Rd,Rt,Rs
   case 0x17: {
-    lea(reg(1), Rd);
-    lea(reg(2), Rt);
-    lea(reg(3), Rs);
-    call(&CPU::DSRAV);
-    emitZeroClear(Rdn);
+    if (!checkDualAllowed(ctx)) return 1;
+    if (Rdn == 0) return 0;
+    and64(reg(0), mem(Rs32), imm(63));
+    ashr64(mem(Rd), mem(Rt), reg(0));
     return 0;
   }
 
@@ -950,41 +988,42 @@ auto CPU::Recompiler::emitSPECIAL(u32 instruction) -> bool {
 
   //DADD Rd,Rs,Rt
   case 0x2c: {
-    lea(reg(1), Rd);
-    lea(reg(2), Rs);
-    lea(reg(3), Rt);
-    call(&CPU::DADD);
-    emitZeroClear(Rdn);
+    if (!checkDualAllowed(ctx)) return 1;
+    add64(reg(0), mem(Rs), mem(Rt), set_o);
+    auto skip = emitOverflowCheck(reg(2));
+    if(Rdn > 0) mov64(mem(Rd), reg(0));
+    setLabel(skip);
     return 0;
   }
 
   //DADDU Rd,Rs,Rt
   case 0x2d: {
-    lea(reg(1), Rd);
-    lea(reg(2), Rs);
-    lea(reg(3), Rt);
-    call(&CPU::DADDU);
-    emitZeroClear(Rdn);
+    if (!checkDualAllowed(ctx)) {
+      return 1;
+    }
+
+    if(Rdn == 0) return 0;
+
+    add64(reg(0), mem(Rs), mem(Rt));
+    mov64(mem(Rd), reg(0));
     return 0;
   }
 
   //DSUB Rd,Rs,Rt
   case 0x2e: {
-    lea(reg(1), Rd);
-    lea(reg(2), Rs);
-    lea(reg(3), Rt);
-    call(&CPU::DSUB);
-    emitZeroClear(Rdn);
+    if (!checkDualAllowed(ctx)) return 1;
+    sub64(reg(0), mem(Rs), mem(Rt), set_o);
+    auto skip = emitOverflowCheck(reg(2));
+    if(Rdn > 0) mov64(mem(Rd), reg(0));
+    setLabel(skip);
     return 0;
   }
 
   //DSUBU Rd,Rs,Rt
   case 0x2f: {
-    lea(reg(1), Rd);
-    lea(reg(2), Rs);
-    lea(reg(3), Rt);
-    call(&CPU::DSUBU);
-    emitZeroClear(Rdn);
+    if (!checkDualAllowed(ctx)) return 1;
+    sub64(reg(0), mem(Rs), mem(Rt), set_o);
+    if(Rdn > 0) mov64(mem(Rd), reg(0));
     return 0;
   }
 
@@ -1050,11 +1089,9 @@ auto CPU::Recompiler::emitSPECIAL(u32 instruction) -> bool {
 
   //DSLL Rd,Rt,Sa
   case 0x38: {
-    lea(reg(1), Rd);
-    lea(reg(2), Rt);
-    mov32(reg(3), imm(Sa));
-    call(&CPU::DSLL);
-    emitZeroClear(Rdn);
+    if (!checkDualAllowed(ctx)) return 1;
+    if (Rdn == 0) return 0;
+    shl64(mem(Rd), mem(Rt), imm(Sa));
     return 0;
   }
 
@@ -1076,21 +1113,17 @@ auto CPU::Recompiler::emitSPECIAL(u32 instruction) -> bool {
 
   //DSRA Rd,Rt,Sa
   case 0x3b: {
-    lea(reg(1), Rd);
-    lea(reg(2), Rt);
-    mov32(reg(3), imm(Sa));
-    call(&CPU::DSRA);
-    emitZeroClear(Rdn);
+    if (!checkDualAllowed(ctx)) return 1;
+    if (Rdn == 0) return 0;
+    ashr64(mem(Rd), mem(Rt), imm(Sa));
     return 0;
   }
 
   //DSLL32 Rd,Rt,Sa
   case 0x3c: {
-    lea(reg(1), Rd);
-    lea(reg(2), Rt);
-    mov32(reg(3), imm(Sa+32));
-    call(&CPU::DSLL);
-    emitZeroClear(Rdn);
+    if (!checkDualAllowed(ctx)) return 1;
+    if (Rdn == 0) return 0;
+    shl64(mem(Rd), mem(Rt), imm(Sa+32));
     return 0;
   }