Merge pull request #19262 from hrydgard/ir-specialization

IR: Add some interpreter-only IR instructions for faster interpretation
hrydgard · Jun 7, 2024 · 27815c7 · 27815c7
2 parents 55fecce + 0c24629
commit 27815c7
Showing 7 changed files with 163 additions and 7 deletions.
diff --git a/Core/MIPS/IR/IRFrontend.cpp b/Core/MIPS/IR/IRFrontend.cpp
@@ -284,14 +284,15 @@ void IRFrontend::DoJit(u32 em_address, std::vector<IRInst> &instructions, u32 &m
 			&PropagateConstants,
 			&PurgeTemps,
 			&ReduceVec4Flush,
+			&OptimizeLoadsAfterStores,
 			// &ReorderLoadStore,
 			// &MergeLoadStore,
 			// &ThreeOpToTwoOp,
 		};
 
 		if (opts.optimizeForInterpreter) {
 			// Add special passes here.
-			// passes.push_back(&ReorderLoadStore);
+			passes.push_back(&OptimizeForInterpreter);
 		}
 		if (IRApplyPasses(passes.data(), passes.size(), ir, simplified, opts))
 			logBlocks = 1;

diff --git a/Core/MIPS/IR/IRInst.cpp b/Core/MIPS/IR/IRInst.cpp
@@ -1,4 +1,5 @@
 #include "Common/CommonFuncs.h"
+#include "Common/Log.h"
 #include "Core/MIPS/IR/IRInst.h"
 #include "Core/MIPS/MIPSDebugInterface.h"
 #include "Core/HLE/ReplaceTables.h"
@@ -8,6 +9,7 @@
 //  _ = ignore
 //  G = GPR register
 //  C = 32-bit constant from array
+//  c = 8-bit constant from array
 //  I = immediate value from instruction
 //  F = FPR register, single
 //  V = FPR register, Vec4. Reg number always divisible by 4.
@@ -29,10 +31,13 @@ static const IRMeta irMeta[] = {
 	{ IROp::Or, "Or", "GGG" },
 	{ IROp::Xor, "Xor", "GGG" },
 	{ IROp::AddConst, "AddConst", "GGC" },
+	{ IROp::OptAddConst, "OptAddConst", "GC" },
 	{ IROp::SubConst, "SubConst", "GGC" },
 	{ IROp::AndConst, "AndConst", "GGC" },
 	{ IROp::OrConst, "OrConst", "GGC" },
 	{ IROp::XorConst, "XorConst", "GGC" },
+	{ IROp::OptAndConst, "OptAndConst", "GC" },
+	{ IROp::OptOrConst, "OptOrConst", "GC" },
 	{ IROp::Shl, "Shl", "GGG" },
 	{ IROp::Shr, "Shr", "GGG" },
 	{ IROp::Sar, "Sar", "GGG" },
@@ -115,6 +120,7 @@ static const IRMeta irMeta[] = {
 	{ IROp::FSatMinus1_1, "FSat(-1 - 1)", "FF" },
 	{ IROp::FMovFromGPR, "FMovFromGPR", "FG" },
 	{ IROp::FMovToGPR, "FMovToGPR", "GF" },
+	{ IROp::OptFMovToGPRShr8, "OptFMovToGPRShr8", "GF" },
 	{ IROp::FpCondFromReg, "FpCondFromReg", "_G" },
 	{ IROp::FpCondToReg, "FpCondToReg", "G" },
 	{ IROp::FpCtrlFromReg, "FpCtrlFromReg", "_G" },
@@ -128,7 +134,7 @@ static const IRMeta irMeta[] = {
 	{ IROp::FCmpVfpuAggregate, "FCmpVfpuAggregate", "I" },
 	{ IROp::Vec4Init, "Vec4Init", "Vv" },
 	{ IROp::Vec4Shuffle, "Vec4Shuffle", "VVs" },
-	{ IROp::Vec4Blend, "Vec4Blend", "VVVC" },
+	{ IROp::Vec4Blend, "Vec4Blend", "VVVc" },
 	{ IROp::Vec4Mov, "Vec4Mov", "VV" },
 	{ IROp::Vec4Add, "Vec4Add", "VVV" },
 	{ IROp::Vec4Sub, "Vec4Sub", "VVV" },
@@ -218,6 +224,11 @@ int IRWriter::AddConstantFloat(float value) {
 	return AddConstant(val);
 }
 
+// Overwrites the constant of an instruction that has already been written to this writer.
+// instNumber is the index of that instruction in the written sequence and must be in range
+// (guarded by _dbg_assert_). Only the constant changes; the rest of the instruction is kept.
+void IRWriter::ReplaceConstant(size_t instNumber, u32 newConstant) {
+	_dbg_assert_(insts_.size() > instNumber);
+	IRInst &target = insts_[instNumber];
+	target.constant = newConstant;
+}
+
 static std::string GetGPRName(int r) {
 	if (r < 32) {
 		return currentDebugMIPS->GetRegName(0, r);
@@ -293,10 +304,13 @@ void DisassembleParam(char *buf, int bufSize, u8 param, char type, u32 constant)
 		}
 		break;
 	case 'C':
-		snprintf(buf, bufSize, "%08x", constant);
+		snprintf(buf, bufSize, "0x%08x", constant);
+		break;
+	case 'c':
+		snprintf(buf, bufSize, "0x%02x", constant);
 		break;
 	case 'I':
-		snprintf(buf, bufSize, "%02x", param);
+		snprintf(buf, bufSize, "0x%02x", param);
 		break;
 	case 'm':
 		snprintf(buf, bufSize, "%d", param);

diff --git a/Core/MIPS/IR/IRInst.h b/Core/MIPS/IR/IRInst.h
@@ -17,6 +17,9 @@
 // even be directly JIT-ed, but the gains will probably be tiny over our older direct
 // MIPS->target JITs.
 
+// Ops beginning with "Opt" are specialized for IR Interpreter use. These will not be produced
+// for the IR JITs.
+
 enum class IROp : uint8_t {
 	SetConst,
 	SetConstF,
@@ -33,11 +36,14 @@ enum class IROp : uint8_t {
 	Xor,
 
 	AddConst,
+	OptAddConst,
 	SubConst,
 
 	AndConst,
 	OrConst,
 	XorConst,
+	OptAndConst,
+	OptOrConst,
 
 	Shl,
 	Shr,
@@ -133,6 +139,7 @@ enum class IROp : uint8_t {
 
 	FMovFromGPR,
 	FMovToGPR,
+	OptFMovToGPRShr8,
 
 	FSat0_1,
 	FSatMinus1_1,
@@ -391,6 +398,7 @@ class IRWriter {
 	void Clear() {
 		insts_.clear();
 	}
+	void ReplaceConstant(size_t instNumber, u32 newConstant);
 
 	const std::vector<IRInst> &GetInstructions() const { return insts_; }
 

diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp
@@ -120,15 +120,24 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) {
 		case IROp::AddConst:
 			mips->r[inst->dest] = mips->r[inst->src1] + inst->constant;
 			break;
+		case IROp::OptAddConst:  // For this one, it's worth having a "unary" variant of the above that only needs to read one register param.
+			mips->r[inst->dest] += inst->constant;
+			break;
 		case IROp::SubConst:
 			mips->r[inst->dest] = mips->r[inst->src1] - inst->constant;
 			break;
 		case IROp::AndConst:
 			mips->r[inst->dest] = mips->r[inst->src1] & inst->constant;
 			break;
+		case IROp::OptAndConst:  // For this one, it's worth having a "unary" variant of the above that only needs to read one register param.
+			mips->r[inst->dest] &= inst->constant;
+			break;
 		case IROp::OrConst:
 			mips->r[inst->dest] = mips->r[inst->src1] | inst->constant;
 			break;
+		case IROp::OptOrConst:
+			mips->r[inst->dest] |= inst->constant;
+			break;
 		case IROp::XorConst:
 			mips->r[inst->dest] = mips->r[inst->src1] ^ inst->constant;
 			break;
@@ -431,6 +440,8 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) {
 
 		case IROp::Vec2Pack31To16:
 		{
+			// Used in Tekken 6
+
 			u32 val = (mips->fi[inst->src1] >> 15) & 0xFFFF;
 			val |= (mips->fi[inst->src1 + 1] << 1) & 0xFFFF0000;
 			mips->fi[inst->dest] = val;
@@ -451,6 +462,8 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) {
 
 		case IROp::Vec4Pack31To8:
 		{
+			// Used in Tekken 6
+
 			// Removed previous SSE code due to the need for unsigned 16-bit pack, which I'm too lazy to work around the lack of in SSE2.
 			// pshufb or SSE4 instructions can be used instead.
 			u32 val = (mips->fi[inst->src1] >> 23) & 0xFF;
@@ -987,7 +1000,13 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) {
 		case IROp::FMovToGPR:
 			memcpy(&mips->r[inst->dest], &mips->f[inst->src1], 4);
 			break;
-
+		case IROp::OptFMovToGPRShr8:
+		{
+			u32 temp;
+			memcpy(&temp, &mips->f[inst->src1], 4);
+			mips->r[inst->dest] = temp >> 8;
+			break;
+		}
 		case IROp::ExitToConst:
 			return inst->constant;
 

diff --git a/Core/MIPS/IR/IRJit.cpp b/Core/MIPS/IR/IRJit.cpp
@@ -255,15 +255,19 @@ void IRJit::RunLoopUntil(u64 globalticks) {
 			u32 opcode = inst & 0xFF000000;
 			if (opcode == MIPS_EMUHACK_OPCODE) {
 				u32 offset = inst & 0x00FFFFFF; // Alternatively, inst - opcode
+				const IRInst *instPtr = blocks_.GetArenaPtr() + offset;
+				_dbg_assert_(instPtr->op == IROp::Downcount);
+				mips->downcount -= instPtr->constant;
+				instPtr++;
 #ifdef IR_PROFILING
 				IRBlock *block = blocks_.GetBlock(blocks_.GetBlockNumFromOffset(offset));
 				TimeSpan span;
-				mips->pc = IRInterpret(mips, blocks_.GetArenaPtr() + offset);
+				mips->pc = IRInterpret(mips, instPtr);
 				int64_t elapsedNanos = span.ElapsedNanos();
 				block->profileStats_.executions += 1;
 				block->profileStats_.totalNanos += elapsedNanos;
 #else
-				mips->pc = IRInterpret(mips, blocks_.GetArenaPtr() + offset);
+				mips->pc = IRInterpret(mips, instPtr);
 #endif
 				// Note: this will "jump to zero" on a badly constructed block missing exits.
 				if (!Memory::IsValid4AlignedAddress(mips->pc)) {

diff --git a/Core/MIPS/IR/IRPassSimplify.cpp b/Core/MIPS/IR/IRPassSimplify.cpp
@@ -2149,3 +2149,110 @@ bool ReduceVec4Flush(const IRWriter &in, IRWriter &out, const IROptions &opts) {
 	}
 	return logBlocks;
 }
+
+// This optimizes away redundant loads-after-stores, which are surprisingly not that uncommon.
+//
+// A Load32 is dropped when it immediately follows a Store32 that uses the same base register
+// (src1) and the same constant offset, and the load's destination is the very register the
+// store just wrote out (the store's src3). Everything else is copied to `out` unchanged.
+//
+// Returns whether the block should be logged; this pass never asks for that, so it is
+// always false.
+bool OptimizeLoadsAfterStores(const IRWriter &in, IRWriter &out, const IROptions &opts) {
+	CONDITIONAL_DISABLE;
+
+	bool logBlocks = false;
+	const std::vector<IRInst> &insts = in.GetInstructions();
+	for (int i = 0, n = (int)insts.size(); i < n; i++) {
+		const IRInst inst = insts[i];
+		out.Write(inst);
+
+		// Only a Store32 that has a successor can make the following instruction redundant.
+		// This also covers the last instruction, which is simply copied.
+		if (inst.op != IROp::Store32 || i == n - 1)
+			continue;
+
+		const IRInst &next = insts[i + 1];
+		if (next.op == IROp::Load32 &&
+			next.constant == inst.constant &&
+			next.dest == inst.src3 &&
+			next.src1 == inst.src1) {
+			// The upcoming load is completely redundant.
+			// Skip it.
+			i++;
+		}
+	}
+
+	return logBlocks;
+}
+
+// Rewrites a block into a form that is cheaper for the IR interpreter to execute:
+//  * The first Downcount found is hoisted to instruction 0, so the interpreter can assume
+//    every block starts with one. Any later Downcount is emitted where it was.
+//  * AddConst/AndConst/OrConst whose source and destination are the same register become
+//    OptAddConst/OptAndConst/OptOrConst.
+//  * FMovToGPR directly followed by an in-place ShrImm by 8 of the same register is fused
+//    into a single OptFMovToGPRShr8.
+// All other instructions are copied to `out` unchanged.
+//
+// Returns whether the block should be logged; this pass never asks for that, so it is
+// always false.
+bool OptimizeForInterpreter(const IRWriter &in, IRWriter &out, const IROptions &opts) {
+	CONDITIONAL_DISABLE;
+
+	bool logBlocks = false;
+	// We also move the downcount to the top so the interpreter can assume that it's there.
+	// Its constant is patched below once the block's own Downcount is found.
+	bool foundDowncount = false;
+	out.Write(IROp::Downcount);
+
+	const std::vector<IRInst> &insts = in.GetInstructions();
+	for (int i = 0, n = (int)insts.size(); i < n; i++) {
+		IRInst inst = insts[i];
+		const bool last = i == n - 1;
+
+		// Specialize some instructions.
+		switch (inst.op) {
+		case IROp::Downcount:
+			if (!foundDowncount) {
+				// Move the value into the initial Downcount.
+				foundDowncount = true;
+				out.ReplaceConstant(0, inst.constant);
+			} else {
+				// Already had a downcount. Let's just re-emit it.
+				out.Write(inst);
+			}
+			break;
+		case IROp::AddConst:
+			if (inst.src1 == inst.dest) {
+				inst.op = IROp::OptAddConst;
+			}
+			out.Write(inst);
+			break;
+		case IROp::AndConst:
+			if (inst.src1 == inst.dest) {
+				inst.op = IROp::OptAndConst;
+			}
+			out.Write(inst);
+			break;
+		case IROp::OrConst:
+			if (inst.src1 == inst.dest) {
+				inst.op = IROp::OptOrConst;
+			}
+			out.Write(inst);
+			break;
+		case IROp::FMovToGPR:
+			// Fusing needs a following instruction to look at.
+			if (!last) {
+				const IRInst &next = insts[i + 1];
+				if (next.op == IROp::ShrImm && next.src2 == 8 && next.src1 == next.dest && next.src1 == inst.dest) {
+					// Heavily used when writing display lists.
+					inst.op = IROp::OptFMovToGPRShr8;
+					i++;  // Skip the next instruction.
+				}
+			}
+			out.Write(inst);
+			break;
+		default:
+			out.Write(inst);
+			break;
+		}
+	}
+
+	return logBlocks;
+}
diff --git a/Core/MIPS/IR/IRPassSimplify.h b/Core/MIPS/IR/IRPassSimplify.h
@@ -16,3 +16,6 @@ bool ReorderLoadStore(const IRWriter &in, IRWriter &out, const IROptions &opts);
 bool MergeLoadStore(const IRWriter &in, IRWriter &out, const IROptions &opts);
 bool ApplyMemoryValidation(const IRWriter &in, IRWriter &out, const IROptions &opts);
 bool ReduceVec4Flush(const IRWriter &in, IRWriter &out, const IROptions &opts);
+
+bool OptimizeLoadsAfterStores(const IRWriter &in, IRWriter &out, const IROptions &opts);
+bool OptimizeForInterpreter(const IRWriter &in, IRWriter &out, const IROptions &opts);