From dacacacdd1686d12265621d1cbbdc53ae97d701e Mon Sep 17 00:00:00 2001 From: tykkiman Date: Tue, 10 Sep 2024 21:59:02 +0300 Subject: [PATCH 1/2] n64: access compiled blocks via a hash We'd like the recompiler to take the execution context such as kernel mode into account when compiling blocks. That's why it's necessary to identify blocks not just by address but by all the information used at compile time. This is done by computing a 32-bit key and using that as a block's identifier instead of the last six physical address bits, as was done before. The execution state and its representation as a bit vector are recomputed only when needed, in this case each time Context::setMode() is called, which happens on powerup, in both MTC0 and MFC0 instructions, and on exceptions. Since we now have 32-bit instead of 6-bit keys, the block() function hashes the keys before mapping them to one of the 64 pool rows. The hash function was chosen arbitrarily to be better than a simple multiplicative hash and is likely not the best choice for this exact task. 
--- ares/n64/cpu/context.cpp | 22 +++++++++++++++++++++ ares/n64/cpu/cpu.cpp | 2 +- ares/n64/cpu/cpu.hpp | 26 ++++++++++++++++++++++--- ares/n64/cpu/recompiler.cpp | 38 ++++++++++++++++++++++++++++++------- 4 files changed, 77 insertions(+), 11 deletions(-) diff --git a/ares/n64/cpu/context.cpp b/ares/n64/cpu/context.cpp index 770cfd76a5..6e07d88f35 100644 --- a/ares/n64/cpu/context.cpp +++ b/ares/n64/cpu/context.cpp @@ -18,6 +18,9 @@ auto CPU::Context::setMode() -> void { break; } + jit.update(*this, self); + jitBits = jit.toBits(); + if(bits == 32) { physMask = 0x1fff'ffff; segment[0] = Segment::Mapped; @@ -63,3 +66,22 @@ auto CPU::Context::setMode() -> void { } } } + +auto CPU::Context::JIT::update(const Context& ctx, const CPU& cpu) -> void { + singleInstruction = GDB::server.hasBreakpoints(); + endian = Context::Endian(ctx.endian); + mode = Context::Mode(ctx.mode); + cop1Enabled = cpu.scc.status.enable.coprocessor1 > 0; + floatingPointMode = cpu.scc.status.floatingPointMode > 0; + is64bit = ctx.bits == 64; +} + +auto CPU::Context::JIT::toBits() const -> u32 { + u32 bits = singleInstruction ? 1 << 6 : 0; + bits |= endian ? 1 << 7 : 0; + bits |= (mode & 0x03) << 9; + bits |= cop1Enabled ? 1 << 10 : 0; + bits |= floatingPointMode ? 1 << 11 : 0; + bits |= is64bit ? 
1 << 12 : 0; + return bits; +} diff --git a/ares/n64/cpu/cpu.cpp b/ares/n64/cpu/cpu.cpp index 29f03ce916..46fab32ec7 100644 --- a/ares/n64/cpu/cpu.cpp +++ b/ares/n64/cpu/cpu.cpp @@ -113,7 +113,7 @@ auto CPU::instruction() -> void { if(Accuracy::CPU::Recompiler && recompiler.enabled && access.cache) { if(vaddrAlignedError(access.vaddr, false)) return; - auto block = recompiler.block(ipu.pc, access.paddr, GDB::server.hasBreakpoints()); + auto block = recompiler.block(ipu.pc, access.paddr, context); block->execute(*this); } else { auto data = fetch(access); diff --git a/ares/n64/cpu/cpu.hpp b/ares/n64/cpu/cpu.hpp index bddb7222a1..bc2e062608 100644 --- a/ares/n64/cpu/cpu.hpp +++ b/ares/n64/cpu/cpu.hpp @@ -92,6 +92,18 @@ struct CPU : Thread { enum Mode : u32 { Kernel, Supervisor, User }; enum Segment : u32 { Unused, Mapped, Cached, Direct, Cached32, Direct32, Kernel64, Supervisor64, User64 }; + struct JIT { + bool singleInstruction; + Endian endian; + Mode mode; + bool cop1Enabled; + bool floatingPointMode; + bool is64bit; + + auto update(const Context& ctx, const CPU& cpu) -> void; + auto toBits() const -> u32; + }; + auto littleEndian() const -> bool { return endian == Endian::Little; } auto bigEndian() const -> bool { return endian == Endian::Big; } @@ -106,6 +118,8 @@ struct CPU : Thread { u32 mode; u32 bits; u32 segment[8]; //512_MiB chunks + u32 jitBits; + Context::JIT jit; } context{*this}; //icache.cpp @@ -863,7 +877,11 @@ struct CPU : Thread { }; struct Pool { - Block* blocks[1 << 6]; + struct Row { + Block* block; + u32 tag; + }; + Row rows[1 << 6]; }; auto reset() -> void { @@ -899,9 +917,11 @@ struct CPU : Thread { } auto pool(u32 address) -> Pool*; - auto block(u64 vaddr, u32 address, bool singleInstruction = false) -> Block*; + auto computePoolKey(u32 address, u32 ctxHash) -> u32; + auto computePoolRow(u32 key) -> u32; + auto block(u64 vaddr, u32 address, const Context& ctx) -> Block*; - auto emit(u64 vaddr, u32 address, bool singleInstruction = false) -> 
Block*; + auto emit(u64 vaddr, u32 address, Context::JIT ctx) -> Block*; auto emitZeroClear(u32 n) -> void; auto emitEXECUTE(u32 instruction) -> bool; auto emitSPECIAL(u32 instruction) -> bool; diff --git a/ares/n64/cpu/recompiler.cpp b/ares/n64/cpu/recompiler.cpp index 68a6962243..c17b394d37 100644 --- a/ares/n64/cpu/recompiler.cpp +++ b/ares/n64/cpu/recompiler.cpp @@ -9,10 +9,34 @@ auto CPU::Recompiler::pool(u32 address) -> Pool* { return pool; } -auto CPU::Recompiler::block(u64 vaddr, u32 address, bool singleInstruction) -> Block* { - if(auto block = pool(address)->blocks[address >> 2 & 0x3f]) return block; - auto block = emit(vaddr, address, singleInstruction); - pool(address)->blocks[address >> 2 & 0x3f] = block; +auto CPU::Recompiler::computePoolKey(u32 address, u32 jitBits) -> u32 { + return (address >> 2 & 0x3f) | (jitBits & ~0x3f); +} + +auto CPU::Recompiler::computePoolRow(u32 key) -> u32 { + // Jon Maiga's 'xmx' mixer, see https://jonkagstrom.com/bit-mixer-construction/ + u64 x = key; + x ^= x >> 23; + x *= 0xff51afd7ed558ccdull; + x ^= x >> 23; + u32 row = x & 0x3f; + assert(row < sizeof(Pool::rows)/sizeof(Pool::rows[0])); + return row; +} + +auto CPU::Recompiler::block(u64 vaddr, u32 address, const Context& ctx) -> Block* { + u32 key = computePoolKey(address, ctx.jitBits); + u32 row = computePoolRow(key); + + if (pool(address)->rows[row].tag == key) { + if (auto block = pool(address)->rows[row].block) { + return block; + } + } + + memory::jitprotect(false); + auto block = emit(vaddr, address, ctx.jit); + pool(address)->rows[row] = {.block = block, .tag = key}; memory::jitprotect(true); return block; } @@ -21,7 +45,7 @@ auto CPU::Recompiler::block(u64 vaddr, u32 address, bool singleInstruction) -> B #define IpuReg(r) sreg(1), offsetof(IPU, r) - IpuBase #define PipelineReg(x) mem(sreg(0), offsetof(CPU, pipeline) + offsetof(Pipeline, x)) -auto CPU::Recompiler::emit(u64 vaddr, u32 address, bool singleInstruction) -> Block* { +auto CPU::Recompiler::emit(u64 
vaddr, u32 address, Context::JIT ctx) -> Block* { if(unlikely(allocator.available() < 1_MiB)) { print("CPU allocator flush\n"); allocator.release(); @@ -46,7 +70,7 @@ auto CPU::Recompiler::emit(u64 vaddr, u32 address, bool singleInstruction) -> Bl mov32(reg(2), imm(instruction)); call(&CPU::instructionPrologue); } - bool branched = emitEXECUTE(instruction); + bool branched = emitEXECUTE(instruction, ctx); if(unlikely(instruction == branchToSelf || instruction == jumpToSelf)) { //accelerate idle loops mov32(reg(1), imm(64 * 2)); @@ -60,7 +84,7 @@ auto CPU::Recompiler::emit(u64 vaddr, u32 address, bool singleInstruction) -> Bl vaddr += 4; address += 4; jumpToSelf += 4; - if(hasBranched || (address & 0xfc) == 0 || singleInstruction) break; //block boundary + if(hasBranched || (address & 0xfc) == 0 || ctx.singleInstruction) break; //block boundary hasBranched = branched; jumpEpilog(flag_nz); } From ba4504aaa1571f7b9b34699876da5b7d08db8415 Mon Sep 17 00:00:00 2001 From: tykkiman Date: Tue, 10 Sep 2024 21:59:18 +0300 Subject: [PATCH 2/2] n64: inline simple dual mode operations * Pass JITContext down to leaf emit functions. * Emit inline implementations of basic 64-bit operations. * Use block compile-time information to elide kernel mode checks of the now inlined operations. 
--- ares/n64/cpu/cpu.hpp | 6 +- ares/n64/cpu/recompiler.cpp | 135 +++++++++++++++++++----------------- 2 files changed, 76 insertions(+), 65 deletions(-) diff --git a/ares/n64/cpu/cpu.hpp b/ares/n64/cpu/cpu.hpp index bc2e062608..a42b12ea08 100644 --- a/ares/n64/cpu/cpu.hpp +++ b/ares/n64/cpu/cpu.hpp @@ -922,9 +922,11 @@ struct CPU : Thread { auto block(u64 vaddr, u32 address, const Context& ctx) -> Block*; auto emit(u64 vaddr, u32 address, Context::JIT ctx) -> Block*; + auto emitOverflowCheck(reg temp) -> sljit_jump*; auto emitZeroClear(u32 n) -> void; - auto emitEXECUTE(u32 instruction) -> bool; - auto emitSPECIAL(u32 instruction) -> bool; + auto checkDualAllowed(const Context::JIT& ctx) -> bool; + auto emitEXECUTE(u32 instruction, Context::JIT ctx) -> bool; + auto emitSPECIAL(u32 instruction, Context::JIT ctx) -> bool; auto emitREGIMM(u32 instruction) -> bool; auto emitSCC(u32 instruction) -> bool; auto emitFPU(u32 instruction) -> bool; diff --git a/ares/n64/cpu/recompiler.cpp b/ares/n64/cpu/recompiler.cpp index c17b394d37..f08b4c6187 100644 --- a/ares/n64/cpu/recompiler.cpp +++ b/ares/n64/cpu/recompiler.cpp @@ -127,12 +127,31 @@ auto CPU::Recompiler::emitZeroClear(u32 n) -> void { if(n == 0) mov64(mem(IpuReg(r[0])), imm(0)); } -auto CPU::Recompiler::emitEXECUTE(u32 instruction) -> bool { +auto CPU::Recompiler::emitOverflowCheck(reg temp) -> sljit_jump* { + // If overflow flag set: throw an exception, skip the instruction via the 'end' label. 
+ mov32_f(temp, flag_o); + auto didntOverflow = cmp32_jump(temp, imm(0), flag_eq); + call(&CPU::Exception::arithmeticOverflow, &cpu.exception); + auto end = jump(); + setLabel(didntOverflow); + return end; +} + +auto CPU::Recompiler::checkDualAllowed(const Context::JIT& ctx) -> bool { + if (ctx.mode != Context::Mode::Kernel && !ctx.is64bit) { + call(&CPU::Exception::reservedInstruction, &self.exception); + return false; + } + + return true; +} + +auto CPU::Recompiler::emitEXECUTE(u32 instruction, Context::JIT ctx) -> bool { switch(instruction >> 26) { //SPECIAL case 0x00: { - return emitSPECIAL(instruction); + return emitSPECIAL(instruction, ctx); } //REGIMM @@ -308,21 +327,19 @@ auto CPU::Recompiler::emitEXECUTE(u32 instruction) -> bool { //DADDI Rt,Rs,i16 case 0x18: { - lea(reg(1), Rt); - lea(reg(2), Rs); - mov32(reg(3), imm(i16)); - call(&CPU::DADDI); - emitZeroClear(Rtn); + if (!checkDualAllowed(ctx)) return 1; + add64(reg(0), mem(Rs), imm(i16), set_o); + auto skip = emitOverflowCheck(reg(2)); + if(Rtn > 0) mov64(mem(Rt), reg(0)); + setLabel(skip); return 0; } //DADDIU Rt,Rs,i16 case 0x19: { - lea(reg(1), Rt); - lea(reg(2), Rs); - mov32(reg(3), imm(i16)); - call(&CPU::DADDIU); - emitZeroClear(Rtn); + if (!checkDualAllowed(ctx)) return 1; + add64(reg(0), mem(Rs), imm(i16), set_o); + if(Rtn > 0) mov64(mem(Rt), reg(0)); return 0; } @@ -640,7 +657,7 @@ auto CPU::Recompiler::emitEXECUTE(u32 instruction) -> bool { return 0; } -auto CPU::Recompiler::emitSPECIAL(u32 instruction) -> bool { +auto CPU::Recompiler::emitSPECIAL(u32 instruction, Context::JIT ctx) -> bool { switch(instruction & 0x3f) { //SLL Rd,Rt,Sa @@ -784,11 +801,10 @@ auto CPU::Recompiler::emitSPECIAL(u32 instruction) -> bool { //DSLLV Rd,Rt,Rs case 0x14: { - lea(reg(1), Rd); - lea(reg(2), Rt); - lea(reg(3), Rs); - call(&CPU::DSLLV); - emitZeroClear(Rdn); + if (!checkDualAllowed(ctx)) return 1; + if (Rdn == 0) return 0; + and64(reg(0), mem(Rs32), imm(63)); + shl64(mem(Rd), mem(Rt), reg(0)); return 0; } @@ 
-800,21 +816,19 @@ auto CPU::Recompiler::emitSPECIAL(u32 instruction) -> bool { //DSRLV Rd,Rt,Rs case 0x16: { - lea(reg(1), Rd); - lea(reg(2), Rt); - lea(reg(3), Rs); - call(&CPU::DSRLV); - emitZeroClear(Rdn); + if (!checkDualAllowed(ctx)) return 1; + if (Rdn == 0) return 0; + and64(reg(0), mem(Rs32), imm(63)); + lshr64(mem(Rd), mem(Rt), reg(0)); return 0; } //DSRAV Rd,Rt,Rs case 0x17: { - lea(reg(1), Rd); - lea(reg(2), Rt); - lea(reg(3), Rs); - call(&CPU::DSRAV); - emitZeroClear(Rdn); + if (!checkDualAllowed(ctx)) return 1; + if (Rdn == 0) return 0; + and64(reg(0), mem(Rs32), imm(63)); + ashr64(mem(Rd), mem(Rt), reg(0)); return 0; } @@ -974,41 +988,42 @@ auto CPU::Recompiler::emitSPECIAL(u32 instruction) -> bool { //DADD Rd,Rs,Rt case 0x2c: { - lea(reg(1), Rd); - lea(reg(2), Rs); - lea(reg(3), Rt); - call(&CPU::DADD); - emitZeroClear(Rdn); + if (!checkDualAllowed(ctx)) return 1; + add64(reg(0), mem(Rs), mem(Rt), set_o); + auto skip = emitOverflowCheck(reg(2)); + if(Rdn > 0) mov64(mem(Rd), reg(0)); + setLabel(skip); return 0; } //DADDU Rd,Rs,Rt case 0x2d: { - lea(reg(1), Rd); - lea(reg(2), Rs); - lea(reg(3), Rt); - call(&CPU::DADDU); - emitZeroClear(Rdn); + if (!checkDualAllowed(ctx)) { + return 1; + } + + if(Rdn == 0) return 0; + + add64(reg(0), mem(Rs), mem(Rt)); + mov64(mem(Rd), reg(0)); return 0; } //DSUB Rd,Rs,Rt case 0x2e: { - lea(reg(1), Rd); - lea(reg(2), Rs); - lea(reg(3), Rt); - call(&CPU::DSUB); - emitZeroClear(Rdn); + if (!checkDualAllowed(ctx)) return 1; + sub64(reg(0), mem(Rs), mem(Rt), set_o); + auto skip = emitOverflowCheck(reg(2)); + if(Rdn > 0) mov64(mem(Rd), reg(0)); + setLabel(skip); return 0; } //DSUBU Rd,Rs,Rt case 0x2f: { - lea(reg(1), Rd); - lea(reg(2), Rs); - lea(reg(3), Rt); - call(&CPU::DSUBU); - emitZeroClear(Rdn); + if (!checkDualAllowed(ctx)) return 1; + sub64(reg(0), mem(Rs), mem(Rt), set_o); + if(Rdn > 0) mov64(mem(Rd), reg(0)); return 0; } @@ -1074,11 +1089,9 @@ auto CPU::Recompiler::emitSPECIAL(u32 instruction) -> bool { //DSLL 
Rd,Rt,Sa case 0x38: { - lea(reg(1), Rd); - lea(reg(2), Rt); - mov32(reg(3), imm(Sa)); - call(&CPU::DSLL); - emitZeroClear(Rdn); + if (!checkDualAllowed(ctx)) return 1; + if (Rdn == 0) return 0; + shl64(mem(Rd), mem(Rt), imm(Sa)); return 0; } @@ -1100,21 +1113,17 @@ auto CPU::Recompiler::emitSPECIAL(u32 instruction) -> bool { //DSRA Rd,Rt,Sa case 0x3b: { - lea(reg(1), Rd); - lea(reg(2), Rt); - mov32(reg(3), imm(Sa)); - call(&CPU::DSRA); - emitZeroClear(Rdn); + if (!checkDualAllowed(ctx)) return 1; + if (Rdn == 0) return 0; + ashr64(mem(Rd), mem(Rt), imm(Sa)); return 0; } //DSLL32 Rd,Rt,Sa case 0x3c: { - lea(reg(1), Rd); - lea(reg(2), Rt); - mov32(reg(3), imm(Sa+32)); - call(&CPU::DSLL); - emitZeroClear(Rdn); + if (!checkDualAllowed(ctx)) return 1; + if (Rdn == 0) return 0; + shl64(mem(Rd), mem(Rt), imm(Sa+32)); return 0; }