Commit

Avoid two-step rounding

Hagb committed Apr 14, 2024
1 parent a474285 commit d0cc048
Showing 2 changed files with 6 additions and 102 deletions.
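The title presumably refers to double rounding: in the code removed below, the libm intermediate (log2, log1p, expm1) is rounded once under a temporarily forced rounding mode, and the following multiply is then rounded again under the emulated mode, so the final value is rounded twice, possibly in different directions. A minimal, hypothetical C sketch of that effect (inputs and build flags are illustrative, not from this repository):

```c
/* Hypothetical illustration of mixed-mode ("two-step") rounding.
 * Build with something like: gcc -O0 two_step.c -lm -frounding-math
 * (whether log2 itself honours the dynamic rounding mode depends on the libm). */
#include <fenv.h>
#include <math.h>
#include <stdio.h>

int main(void) {
    double st0 = 10.0, st1 = -3.0;

    /* One step: everything evaluated under the mode the guest expects. */
    fesetround(FE_UPWARD);
    double one_step = st1 * log2(st0);

    /* Two steps: log2 rounded under a forced mode, the product rounded under another. */
    fesetround(FE_DOWNWARD);
    double t = log2(st0);
    fesetround(FE_UPWARD);
    double two_step = st1 * t;

    fesetround(FE_TONEAREST);
    printf("one step:  %a\ntwo steps: %a\n", one_step, two_step); /* may differ in the last bit */
    return 0;
}
```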
80 changes: 4 additions & 76 deletions src/dynarec/dynarec_arm_d9.c
@@ -281,48 +281,21 @@ uintptr_t dynarecD9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst,
CALL(arm_f2xm1, -1, 0);
#else
v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
- if(!box86_dynarec_fastround) {
- LDRH_IMM8(x1, xEmu, offsetof(x86emu_t, cw)); // hopefully cw is not too far for an imm8
- UBFX(x1, x1, 10, 2); // extract round...
- UBFX(x2, x1, 1, 1); // swap bits 0 and 1
- BFI(x2, x1, 1, 1);
- VMRS(x14); // get fpscr
- MOV_REG(x3, x14);
- }
if((PK(0)==0xD9 && PK(1)==0xE8) && // next inst is FLD1
(PK(2)==0xDE && PK(3)==0xC1)) {
MESSAGE(LOG_DUMP, "Hack for fld1 / faddp st1, st0\n");
- if(!box86_dynarec_fastround) {
- VCMP_F64_0(v1);
- B_MARK(cGE); // if ST0 < 0 and if the rounding mode is toward 0, then use upward
- TSTS_IMM8(x2, 0b01);
- AND_IMM8_COND(cNE, x2, x2, 0b01); // 11 (TOWARDZERO) -> 01 (UPWARD), 01 -> 01
- MARK;
- BFI(x3, x2, 22, 2); // inject new round
- VMSR(x3);
- }
-
VMOV_64(0, v1);
- CALL_1D(exp2, box86_dynarec_fastround ? 0 : (1 << x14)); // return is d0
+ CALL_1D(exp2, 0); // return is d0
VMOV_64(v1, 0);
addr+=4;
} else {
- if(!box86_dynarec_fastround) {
- BFI(x3, x2, 22, 2); // inject new round
- VMSR(x3); // put new fpscr
- }
-
//ST0.d = expm1(LN2 * ST0.d);
MOV32(x2, (&d_ln2));
VLDR_64(0, x2, 0);
VMUL_F64(0, 0, v1);
- if(!box86_dynarec_fastround)
- x87_setround(dyn, ninst, x1, x2, -1);
- CALL_1D(expm1, box86_dynarec_fastround ? 0 : (1 << x14)); // return is d0
+ CALL_1D(expm1, 0); // return is d0
VMOV_64(v1, 0);
}
- if(!box86_dynarec_fastround)
- VMSR(x14);
#endif
// should set C1 to 0
break;
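For reference, F2XM1 computes 2^ST0 - 1. The backend evaluates it as expm1(ST0 * ln 2), and the hack above recognises that a following FLD1; FADDP ST1, ST0 just adds the 1 back, so the whole sequence can become a single exp2 call. A plain-C restatement of those identities (the helper names are illustrative, not repository functions):

```c
#include <math.h>
#include <stdio.h>

/* F2XM1: ST0 = 2^ST0 - 1, evaluated as expm1(ST0 * ln 2) so that results
 * for ST0 near zero keep their precision. */
static double f2xm1(double st0) { return expm1(st0 * M_LN2); }

/* Peephole for "f2xm1; fld1; faddp st1, st0": (2^x - 1) + 1 == 2^x,
 * so the three guest instructions collapse into one exp2 call. */
static double f2xm1_fld1_faddp(double st0) { return exp2(st0); }

int main(void) {
    printf("%a\n%a\n", f2xm1(1e-20), f2xm1_fld1_faddp(0.5));
    return 0;
}
```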
@@ -331,36 +304,9 @@ uintptr_t dynarecD9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst,
v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
v2 = x87_get_st(dyn, ninst, x1, x2, 1, NEON_CACHE_ST_D);

- if(!box86_dynarec_fastround) {
- LDRH_IMM8(x1, xEmu, offsetof(x86emu_t, cw)); // hopefully cw is not too far for an imm8
- UBFX(x1, x1, 10, 2); // extract round...
- VCMP_F64_0(v2);
- VMRS_APSR();
- B_MARK(cLT); // if ST1.d < 0 then don't swap bits 0 and 1
- BFI(x1, x1, 2, 1); // if ST1.d >= 0 then swap bits 0 and 1
- UBFX(x1, x1, 1, 2);
- MARK;
- s0 = fpu_get_scratch_double(dyn);
- VMOV_i_64(s0, 0b01110000); // = 1.0
- VCMP_F64(v1, s0);
- VMRS_APSR();
- B_MARK2(cGE); // if ST0 < 1 and if the rounding mode is toward 0, then use upward
- TSTS_IMM8(x1, 0b01);
- AND_IMM8_COND(cNE, x1, x1, 0b01); // 11 (TOWARDZERO) -> 01 (UPWARD), 01 -> 01
- MARK2;
- VMRS(x14); // get fpscr
- MOV_REG(x3, x14);
- BFI(x3, x1, 22, 2); // inject new round
- VMSR(x3); // put new fpscr
- }
-
VMOV_64(0, v1); // prepare call to log2
- CALL_1D(log2, box86_dynarec_fastround ? 0 : (1 << x14));
- if(!box86_dynarec_fastround)
- x87_setround(dyn, ninst, x1, x2, -1);
+ CALL_1D(log2, 0);
VMUL_F64(v2, v2, 0); //ST(1).d = log2(ST0.d)*ST(1).d
- if(!box86_dynarec_fastround)
- VMSR(x14);
x87_do_pop(dyn, ninst, x3);
// should set C1 to 0
break;
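The prologues removed above translate the x87 rounding control (control word bits 11:10) into the ARM FPSCR RMode field (bits 23:22). The two encodings agree for nearest and toward-zero but exchange the codes for round-down and round-up, which is what the UBFX/BFI pair commented "swap bits 0 and 1" does. A plain-C sketch of that mapping (x87_rc_to_arm_rmode is a hypothetical helper, not repository code):

```c
#include <stdint.h>
#include <stdio.h>

/* x87 RC:      00 nearest, 01 down, 10 up,   11 toward zero
 * FPSCR RMode: 00 nearest, 01 up,   10 down, 11 toward zero */
static uint32_t x87_rc_to_arm_rmode(uint16_t cw) {
    uint32_t rc = (cw >> 10) & 3;                        /* UBFX of CW bits 11:10 */
    uint32_t rmode = ((rc >> 1) & 1) | ((rc & 1) << 1);  /* swap bits 0 and 1 */
    return rmode << 22;                                  /* field injected by BFI at bit 22 */
}

int main(void) {
    for (unsigned rc = 0; rc < 4; rc++)
        printf("x87 RC %u -> FPSCR RMode field 0x%08x\n",
               rc, (unsigned)x87_rc_to_arm_rmode((uint16_t)(rc << 10)));
    return 0;
}
```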
@@ -515,32 +461,14 @@ uintptr_t dynarecD9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst,
INST_NAME("FYL2XP1");
v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
v2 = x87_get_st(dyn, ninst, x1, x2, 1, NEON_CACHE_ST_D);
- if(!box86_dynarec_fastround) {
- LDRH_IMM8(x1, xEmu, offsetof(x86emu_t, cw)); // hopefully cw is not too far for an imm8
- UBFX(x1, x1, 10, 2); // extract round...
- VCMP_F64_0(v2);
- VMRS_APSR();
- B_MARK(cLT); // if ST1.d < 0 then don't swap bits 0 and 1
- BFI(x1, x1, 2, 1); // if ST1.d >= 0 then swap bits 0 and 1
- UBFX(x1, x1, 1, 2);
- MARK;
- VMRS(x14); // get fpscr
- MOV_REG(x3, x14);
- BFI(x3, x1, 22, 2); // inject new round
- VMSR(x3); // put new fpscr
- }

//ST(1).d = (ST(1).d * log1p(ST0.d)) / M_LN2;
VMOV_64(0, v1); // prepare call to log1p
- CALL_1D(log1p, box86_dynarec_fastround ? 0 : (1 << x14));
- if(!box86_dynarec_fastround)
- x87_setround(dyn, ninst, x1, x2, -1);
+ CALL_1D(log1p, 0);
VMUL_F64(v2, v2, 0);
MOV32(x2, (&d_ln2));
VLDR_64(0, x2, 0);
VDIV_F64(v2, v2, 0);
- if(!box86_dynarec_fastround)
- VMSR(x14);
x87_do_pop(dyn, ninst, x3);
// should set C1 to 0
break;
28 changes: 2 additions & 26 deletions src/emu/x86rund9.c
@@ -123,20 +123,7 @@ uintptr_t RunD9(x86emu_t *emu, uintptr_t addr)
emu->sw.f.F87_C1 = 0;
break;
case 0xF1: /* FYL2X */
- if (ST1.d < 0) {
- switch (emu->cw.f.C87_RD) {
- case ROUND_Up:
- fesetround(FE_DOWNWARD);
- break;
- case ROUND_Down:
- fesetround(FE_UPWARD);
- }
- }
- if (ST0.d < 1 && emu->cw.f.C87_RD == ROUND_Chop)
- fesetround(FE_UPWARD);
- const double log2_st0 = log2(ST0.d);
- setround(emu);
- ST(1).d *= log2_st0;
+ ST(1).d *= log2(ST0.d);
fpu_do_pop(emu);
emu->sw.f.F87_C1 = 0;
break;
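The block removed above swapped FE_UPWARD and FE_DOWNWARD whenever ST1.d was negative: with a negative multiplier, rounding the intermediate log2 upward pushes the final product downward, and vice versa. A small, hypothetical demo of that sign flip (whether the two log2 results actually differ under directed rounding depends on the libm):

```c
#include <fenv.h>
#include <math.h>
#include <stdio.h>

int main(void) {
    double st0 = 10.0, st1 = -3.0;

    fesetround(FE_UPWARD);
    double up = log2(st0);      /* intermediate rounded up   */
    fesetround(FE_DOWNWARD);
    double down = log2(st0);    /* intermediate rounded down */
    fesetround(FE_TONEAREST);

    /* up >= down, but st1 < 0, so st1*up <= st1*down: the direction flips. */
    printf("st1*up   = %a\nst1*down = %a\n", st1 * up, st1 * down);
    return 0;
}
```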
@@ -209,20 +196,9 @@ uintptr_t RunD9(x86emu_t *emu, uintptr_t addr)
emu->top=(emu->top+1)&7; // this will probably break a few things
break;
case 0xF9: /* FYL2XP1 */
- if (ST1.d < 0) {
- switch (emu->cw.f.C87_RD) {
- case ROUND_Up:
- fesetround(FE_DOWNWARD);
- break;
- case ROUND_Down:
- fesetround(FE_UPWARD);
- }
- }
// Using log1p instead of log2(ST0+1) avoids losing precision,
// especially when ST0 is close to zero (where adding 1 would discard ST0's low bits).
- const double log1p_st0 = log1p(ST0.d);
- setround(emu);
- ST(1).d = (ST(1).d * log1p_st0) / M_LN2;
+ ST(1).d = (ST(1).d * log1p(ST0.d)) / M_LN2;
// = ST1 * log2(ST0 + 1) + error. (in math)
fpu_do_pop(emu);
emu->sw.f.F87_C1 = 0;
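The comment retained above is the reason log1p is used: for ST0 close to zero, forming 1 + ST0 first discards the low-order bits of ST0 before the logarithm is taken. A short demo of the difference (values are illustrative):

```c
#include <math.h>
#include <stdio.h>

int main(void) {
    double x = 1e-18;                  /* 1.0 + x rounds back to 1.0 in double */
    double naive   = log2(1.0 + x);    /* 0.0: the +1 destroyed x entirely     */
    double precise = log1p(x) / M_LN2; /* ~1.4427e-18, the expected magnitude  */
    printf("naive   = %a\nprecise = %a\n", naive, precise);
    return 0;
}
```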
