Skip to content

Commit

Permalink
Merge pull request #19299 from hrydgard/ir-interpreter-opt
Browse files Browse the repository at this point in the history
IR Interpreter: Two small optimizations
  • Loading branch information
hrydgard authored Jun 24, 2024
2 parents 515cac7 + 982a83d commit 867c1f2
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 7 deletions.
4 changes: 4 additions & 0 deletions Common/Math/math_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,10 @@ inline bool my_isinf(float f) {
f2u.u == 0xff800000;
}

// Returns true if the raw 32-bit pattern `u` encodes an IEEE-754 single-precision
// infinity of either sign (0x7f800000 or 0xff800000).
inline bool my_isinf_u(uint32_t u) {
	// Drop the sign bit so +inf and -inf collapse to the same pattern.
	const uint32_t magnitude = u & 0x7fffffffu;
	return magnitude == 0x7f800000u;
}

inline bool my_isnan(float f) {
FP32 f2u;
f2u.f = f;
Expand Down
12 changes: 10 additions & 2 deletions Core/MIPS/IR/IRCompALU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,14 @@ void IRFrontend::CompShiftVar(MIPSOpcode op, IROp shiftOp) {
MIPSGPReg rd = _RD;
MIPSGPReg rt = _RT;
MIPSGPReg rs = _RS;
ir.Write(IROp::AndConst, IRTEMP_0, rs, ir.AddConstant(31));
ir.Write(shiftOp, rd, rt, IRTEMP_0);

if (opts.optimizeForInterpreter) {
// The interpreter already masks where needed, don't need to generate extra ops.
ir.Write(shiftOp, rd, rt, rs);
} else {
ir.Write(IROp::AndConst, IRTEMP_0, rs, ir.AddConstant(31));
ir.Write(shiftOp, rd, rt, IRTEMP_0);
}
}

void IRFrontend::Comp_ShiftType(MIPSOpcode op) {
Expand Down Expand Up @@ -246,6 +252,8 @@ void IRFrontend::Comp_Special3(MIPSOpcode op) {

case 0x4: //ins
{
// TODO: Might be good to support natively in the interpreter. Though, would have to
// abuse a register as a constant
u32 sourcemask = mask >> pos;
u32 destmask = ~(sourcemask << pos);

Expand Down
37 changes: 32 additions & 5 deletions Core/MIPS/IR/IRInterpreter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -524,11 +524,20 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) {

// The previous SSE code was removed because it needs an unsigned 16-bit pack, which SSE2 lacks and which isn't worth working around.
// pshufb or SSE4 instructions can be used instead.
#if PPSSPP_ARCH(ARM_NEON) && 0
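// Untested, and compiled out by the "&& 0" above.
// NOTE(review): vshrn_n_u32 returns uint16x4_t, not uint32x2_t, so the types below look wrong - verify the narrowing steps and shift amounts before enabling.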
uint32x4_t value = vld1q_u32(&mips->fi[inst->src1]);
value = vshlq_n_u32(value, 1);
uint32x2_t halved = vshrn_n_u32(value, 8);
uint32x2_t halvedAgain = vshrn_n_u32(vcombine_u32(halved, vdup_n_u32(0)), 8);
mips->fi[inst->dest] = vget_lane_u32(halvedAgain, 0);
#else
u32 val = (mips->fi[inst->src1] >> 23) & 0xFF;
val |= (mips->fi[inst->src1 + 1] >> 15) & 0xFF00;
val |= (mips->fi[inst->src1 + 2] >> 7) & 0xFF0000;
val |= (mips->fi[inst->src1 + 3] << 1) & 0xFF000000;
mips->fi[inst->dest] = val;
#endif
break;
}

Expand Down Expand Up @@ -627,7 +636,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) {
{
// Not quickly implementable on all platforms, unfortunately.
// Though, this is still pretty fast compared to one split into multiple IR instructions.
// This might be good though: https://stackoverflow.com/a/17004629
// This might be good though: https://gist.github.com/rikusalminen/3040241
float dot = mips->f[inst->src1] * mips->f[inst->src2];
for (int i = 1; i < 4; i++)
dot += mips->f[inst->src1 + i] * mips->f[inst->src2 + i];
Expand Down Expand Up @@ -820,8 +829,7 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) {
}
case IROp::BSwap32:
{
u32 x = mips->r[inst->src1];
mips->r[inst->dest] = swap32(x);
mips->r[inst->dest] = swap32(mips->r[inst->src1]);
break;
}

Expand All @@ -832,12 +840,31 @@ u32 IRInterpret(MIPSState *mips, const IRInst *inst) {
mips->f[inst->dest] = mips->f[inst->src1] - mips->f[inst->src2];
break;
case IROp::FMul:
if ((mips->f[inst->src2] == 0.0f && my_isinf(mips->f[inst->src1])) || (mips->f[inst->src1] == 0.0f && my_isinf(mips->f[inst->src2]))) {
#if 1
{
float a = mips->f[inst->src1];
float b = mips->f[inst->src2];
if ((b == 0.0f && my_isinf(a)) || (a == 0.0f && my_isinf(b))) {
mips->fi[inst->dest] = 0x7fc00000;
} else {
mips->f[inst->dest] = mips->f[inst->src1] * mips->f[inst->src2];
mips->f[inst->dest] = a * b;
}
}
break;
#else
// Not sure if faster since it needs to load the operands twice? But the code is simpler.
{
// Takes care of negative zero by masking away the top bit, which also makes the inf check shorter.
u32 a = mips->fi[inst->src1] & 0x7FFFFFFF;
u32 b = mips->fi[inst->src2] & 0x7FFFFFFF;
if ((a == 0 && b == 0x7F800000) || (b == 0 && a == 0x7F800000)) {
mips->fi[inst->dest] = 0x7fc00000;
} else {
mips->f[inst->dest] = mips->f[inst->src1] * mips->f[inst->src2];
}
break;
}
#endif
case IROp::FDiv:
mips->f[inst->dest] = mips->f[inst->src1] / mips->f[inst->src2];
break;
Expand Down

0 comments on commit 867c1f2

Please sign in to comment.