Merge pull request #19299 from hrydgard/ir-interpreter-opt

IR Interpreter: Two small optimizations
hrydgard · Jun 24, 2024 · 867c1f2 · 867c1f2
2 parents 515cac7 + 982a83d
commit 867c1f2
Show file tree

Hide file tree

Showing 3 changed files with 46 additions and 7 deletions.
diff --git a/Common/Math/math_util.h b/Common/Math/math_util.h
@@ -110,6 +110,10 @@ inline bool my_isinf(float f) {
 		f2u.u == 0xff800000;
 }
 
+inline bool my_isinf_u(uint32_t u) {  // Bit-pattern counterpart of my_isinf(): u holds the raw 32-bit float bits.
+	return u == 0x7f800000 || u == 0xff800000;  // Matches the +inf or -inf encoding exactly.
+}
+
 inline bool my_isnan(float f) {
 	FP32 f2u;
 	f2u.f = f;

diff --git a/Core/MIPS/IR/IRCompALU.cpp b/Core/MIPS/IR/IRCompALU.cpp
@@ -192,8 +192,14 @@ void IRFrontend::CompShiftVar(MIPSOpcode op, IROp shiftOp) {
 	MIPSGPReg rd = _RD;
 	MIPSGPReg rt = _RT;
 	MIPSGPReg rs = _RS;
-	ir.Write(IROp::AndConst, IRTEMP_0, rs, ir.AddConstant(31));
-	ir.Write(shiftOp, rd, rt, IRTEMP_0);
+
+	if (opts.optimizeForInterpreter) {
+		// The interpreter already masks the shift amount where needed, so no extra ops need to be generated.
+		ir.Write(shiftOp, rd, rt, rs);
+	} else {
+		ir.Write(IROp::AndConst, IRTEMP_0, rs, ir.AddConstant(31));
+		ir.Write(shiftOp, rd, rt, IRTEMP_0);
+	}
 }
 
 void IRFrontend::Comp_ShiftType(MIPSOpcode op) {
@@ -246,6 +252,8 @@ void IRFrontend::Comp_Special3(MIPSOpcode op) {
 
 	case 0x4: //ins
 	{
+		// TODO: Might be good to support natively in the interpreter. Though, would have to
+		// abuse a register as a constant
 		u32 sourcemask = mask >> pos;
 		u32 destmask = ~(sourcemask << pos);
 

diff --git a/Core/MIPS/IR/IRInterpreter.cpp b/Core/MIPS/IR/IRInterpreter.cpp
@@ -524,11 +524,20 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) {
 
 			// Removed previous SSE code due to the need for unsigned 16-bit pack, which I'm too lazy to work around the lack of in SSE2.
 			// pshufb or SSE4 instructions can be used instead.
+#if PPSSPP_ARCH(ARM_NEON) && 0
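+			// Untested and disabled (&& 0). NOTE: vshrn_n_u32 narrows to uint16x4_t, so the types below need fixing before enabling.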
+			uint32x4_t value = vld1q_u32(&mips->fi[inst->src1]);
+			value = vshlq_n_u32(value, 1);
+			uint32x2_t halved = vshrn_n_u32(value, 8);
+			uint32x2_t halvedAgain = vshrn_n_u32(vcombine_u32(halved, vdup_n_u32(0)), 8);
+			mips->fi[inst->dest] = vget_lane_u32(halvedAgain, 0);
+#else
 			u32 val = (mips->fi[inst->src1] >> 23) & 0xFF;
 			val |= (mips->fi[inst->src1 + 1] >> 15) & 0xFF00;
 			val |= (mips->fi[inst->src1 + 2] >> 7) & 0xFF0000;
 			val |= (mips->fi[inst->src1 + 3] << 1) & 0xFF000000;
 			mips->fi[inst->dest] = val;
+#endif
 			break;
 		}
 
@@ -627,7 +636,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) {
 		{
 			// Not quickly implementable on all platforms, unfortunately.
 			// Though, this is still pretty fast compared to one split into multiple IR instructions.
-			// This might be good though: https://stackoverflow.com/a/17004629
+			// This might be good though: https://gist.github.com/rikusalminen/3040241
 			float dot = mips->f[inst->src1] * mips->f[inst->src2];
 			for (int i = 1; i < 4; i++)
 				dot += mips->f[inst->src1 + i] * mips->f[inst->src2 + i];
@@ -820,8 +829,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) {
 		}
 		case IROp::BSwap32:
 		{
-			u32 x = mips->r[inst->src1];
-			mips->r[inst->dest] = swap32(x);
+			mips->r[inst->dest] = swap32(mips->r[inst->src1]);
 			break;
 		}
 
@@ -832,12 +840,31 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) {
 			mips->f[inst->dest] = mips->f[inst->src1] - mips->f[inst->src2];
 			break;
 		case IROp::FMul:
-			if ((mips->f[inst->src2] == 0.0f && my_isinf(mips->f[inst->src1])) || (mips->f[inst->src1] == 0.0f && my_isinf(mips->f[inst->src2]))) {
+#if 1
+		{
+			float a = mips->f[inst->src1];
+			float b = mips->f[inst->src2];
+			if ((b == 0.0f && my_isinf(a)) || (a == 0.0f && my_isinf(b))) {
 				mips->fi[inst->dest] = 0x7fc00000;
 			} else {
-				mips->f[inst->dest] = mips->f[inst->src1] * mips->f[inst->src2];
+				mips->f[inst->dest] = a * b;
 			}
+		}
 			break;
+#else
+			// Not sure this is faster, since it has to load the operands twice, but the code is simpler.
+			{
+				// Takes care of negative zero by masking away the top bit, which also makes the inf check shorter.
+				u32 a = mips->fi[inst->src1] & 0x7FFFFFFF;
+				u32 b = mips->fi[inst->src2] & 0x7FFFFFFF;
+				if ((a == 0 && b == 0x7F800000) || (b == 0 && a == 0x7F800000)) {
+					mips->fi[inst->dest] = 0x7fc00000;
+				} else {
+					mips->f[inst->dest] = mips->f[inst->src1] * mips->f[inst->src2];
+				}
+				break;
+			}
+#endif
 		case IROp::FDiv:
 			mips->f[inst->dest] = mips->f[inst->src1] / mips->f[inst->src2];
 			break;