diff --git a/src/dynarec/dynarec_arm_d9.c b/src/dynarec/dynarec_arm_d9.c
index 740ea50bd..819eb993c 100755
--- a/src/dynarec/dynarec_arm_d9.c
+++ b/src/dynarec/dynarec_arm_d9.c
@@ -281,48 +281,21 @@ uintptr_t dynarecD9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst,
             CALL(arm_f2xm1, -1, 0);
             #else
             v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
-            if(!box86_dynarec_fastround) {
-                LDRH_IMM8(x1, xEmu, offsetof(x86emu_t, cw));    // hopefully cw is not too far for an imm8
-                UBFX(x1, x1, 10, 2);    // extract round...
-                UBFX(x2, x1, 1, 1);     // swap bits 0 and 1
-                BFI(x2, x1, 1, 1);
-                VMRS(x14);              // get fpscr
-                MOV_REG(x3, x14);
-            }
             if((PK(0)==0xD9 && PK(1)==0xE8) &&  // next inst is FLD1
              (PK(2)==0xDE && PK(3)==0xC1)) {
                 MESSAGE(LOG_DUMP, "Hack for fld1 / faddp st1, st0\n");
-                if(!box86_dynarec_fastround) {
-                    VCMP_F64_0(v1);
-                    B_MARK(cGE);    // if ST0 < 0 and if the rounding mode is toward 0, then use upward
-                    TSTS_IMM8(x2, 0b01);
-                    AND_IMM8_COND(cNE, x2, x2, 0b01);   // 11 (TOWARDZERO) -> 01 (UPWARD), 01 -> 01
-                    MARK;
-                    BFI(x3, x2, 22, 2); // inject new round
-                    VMSR(x3);
-                }
-
                 VMOV_64(0, v1);
-                CALL_1D(exp2, box86_dynarec_fastround ? 0 : (1 << x14)); // return is d0
+                CALL_1D(exp2, 0); // return is d0
                 VMOV_64(v1, 0);
                 addr+=4;
             } else {
-                if(!box86_dynarec_fastround) {
-                    BFI(x3, x2, 22, 2); // inject new round
-                    VMSR(x3);           // put new fpscr
-                }
-
                 //ST0.d = expm1(LN2 * ST0.d);
                 MOV32(x2, (&d_ln2));
                 VLDR_64(0, x2, 0);
                 VMUL_F64(0, 0, v1);
-                if(!box86_dynarec_fastround)
-                    x87_setround(dyn, ninst, x1, x2, -1);
-                CALL_1D(expm1, box86_dynarec_fastround ? 0 : (1 << x14)); // return is d0
+                CALL_1D(expm1, 0); // return is d0
                 VMOV_64(v1, 0);
             }
-            if(!box86_dynarec_fastround)
-                VMSR(x14);
             #endif
             // should set C1 to 0
             break;
@@ -331,36 +304,9 @@ uintptr_t dynarecD9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst,
             v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
             v2 = x87_get_st(dyn, ninst, x1, x2, 1, NEON_CACHE_ST_D);
 
-            if(!box86_dynarec_fastround) {
-                LDRH_IMM8(x1, xEmu, offsetof(x86emu_t, cw));    // hopefully cw is not too far for an imm8
-                UBFX(x1, x1, 10, 2);    // extract round...
-                VCMP_F64_0(v2);
-                VMRS_APSR();
-                B_MARK(cLT);        // if ST1.d < 0 then don't swap bits 0 and 1
-                BFI(x1, x1, 2, 1);  // if ST1.d >= 0 then swap bits 0 and 1
-                UBFX(x1, x1, 1, 2);
-                MARK;
-                s0 = fpu_get_scratch_double(dyn);
-                VMOV_i_64(s0, 0b01110000);  // = 1.0
-                VCMP_F64(v1, s0);
-                VMRS_APSR();
-                B_MARK2(cGE);       // if ST0 < 1 and if the rounding mode is toward 0, then use upward
-                TSTS_IMM8(x1, 0b01);
-                AND_IMM8_COND(cNE, x1, x1, 0b01);   // 11 (TOWARDZERO) -> 01 (UPWARD), 01 -> 01
-                MARK2;
-                VMRS(x14);          // get fpscr
-                MOV_REG(x3, x14);
-                BFI(x3, x1, 22, 2); // inject new round
-                VMSR(x3);           // put new fpscr
-            }
-
             VMOV_64(0, v1); // prepare call to log2
-            CALL_1D(log2, box86_dynarec_fastround ? 0 : (1 << x14));
-            if(!box86_dynarec_fastround)
-                x87_setround(dyn, ninst, x1, x2, -1);
+            CALL_1D(log2, 0);
             VMUL_F64(v2, v2, 0);    //ST(1).d = log2(ST0.d)*ST(1).d
-            if(!box86_dynarec_fastround)
-                VMSR(x14);
             x87_do_pop(dyn, ninst, x3);
             // should set C1 to 0
             break;
@@ -515,32 +461,14 @@ uintptr_t dynarecD9(dynarec_arm_t* dyn, uintptr_t addr, uintptr_t ip, int ninst,
             INST_NAME("FYL2XP1");
             v1 = x87_get_st(dyn, ninst, x1, x2, 0, NEON_CACHE_ST_D);
             v2 = x87_get_st(dyn, ninst, x1, x2, 1, NEON_CACHE_ST_D);
-            if(!box86_dynarec_fastround) {
-                LDRH_IMM8(x1, xEmu, offsetof(x86emu_t, cw));    // hopefully cw is not too far for an imm8
-                UBFX(x1, x1, 10, 2);    // extract round...
-                VCMP_F64_0(v2);
-                VMRS_APSR();
-                B_MARK(cLT);        // if ST1.d < 0 then don't swap bits 0 and 1
-                BFI(x1, x1, 2, 1);  // if ST1.d >= 0 then swap bits 0 and 1
-                UBFX(x1, x1, 1, 2);
-                MARK;
-                VMRS(x14);          // get fpscr
-                MOV_REG(x3, x14);
-                BFI(x3, x1, 22, 2); // inject new round
-                VMSR(x3);           // put new fpscr
-            }
 
             //ST(1).d = (ST(1).d * log1p(ST0.d)) / M_LN2;
             VMOV_64(0, v1); // prepare call to log1p
-            CALL_1D(log1p, box86_dynarec_fastround ? 0 : (1 << x14));
-            if(!box86_dynarec_fastround)
-                x87_setround(dyn, ninst, x1, x2, -1);
+            CALL_1D(log1p, 0);
             VMUL_F64(v2, v2, 0);
             MOV32(x2, (&d_ln2));
             VLDR_64(0, x2, 0);
             VDIV_F64(v2, v2, 0);
-            if(!box86_dynarec_fastround)
-                VMSR(x14);
             x87_do_pop(dyn, ninst, x3);
             // should set C1 to 0
             break;
diff --git a/src/emu/x86rund9.c b/src/emu/x86rund9.c
index 87e74158c..aebc2c866 100755
--- a/src/emu/x86rund9.c
+++ b/src/emu/x86rund9.c
@@ -123,20 +123,7 @@ uintptr_t RunD9(x86emu_t *emu, uintptr_t addr)
             emu->sw.f.F87_C1 = 0;
             break;
         case 0xF1:  /* FYL2X */
-            if (ST1.d < 0) {
-                switch (emu->cw.f.C87_RD) {
-                    case ROUND_Up:
-                        fesetround(FE_DOWNWARD);
-                        break;
-                    case ROUND_Down:
-                        fesetround(FE_UPWARD);
-                }
-            }
-            if (ST0.d < 1 && emu->cw.f.C87_RD == ROUND_Chop)
-                fesetround(FE_UPWARD);
-            const double log2_st0 = log2(ST0.d);
-            setround(emu);
-            ST(1).d *= log2_st0;
+            ST(1).d *= log2(ST0.d);
             fpu_do_pop(emu);
             emu->sw.f.F87_C1 = 0;
             break;
@@ -209,20 +196,9 @@ uintptr_t RunD9(x86emu_t *emu, uintptr_t addr)
             emu->top=(emu->top+1)&7;    // this will probably break a few things
             break;
         case 0xF9:  /* FYL2XP1 */
-            if (ST1.d < 0) {
-                switch (emu->cw.f.C87_RD) {
-                    case ROUND_Up:
-                        fesetround(FE_DOWNWARD);
-                        break;
-                    case ROUND_Down:
-                        fesetround(FE_UPWARD);
-                }
-            }
             // Using the log1p instead of log2(ST0+1) can avoid losing precision much,
             // expecially when ST0 is close to zero (which loses the precise when +1).
-            const double log1p_st0 = log1p(ST0.d);
-            setround(emu);
-            ST(1).d = (ST(1).d * log1p_st0) / M_LN2;
+            ST(1).d = (ST(1).d * log1p(ST0.d)) / M_LN2;
             // = ST1 * log2(ST0 + 1) + error. (in math)
             fpu_do_pop(emu);
             emu->sw.f.F87_C1 = 0;
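
A note on the comment kept in the FYL2XP1 hunk: the interpreter keeps computing ST1 * log1p(ST0) / M_LN2 rather than the literal ST1 * log2(ST0 + 1) because, for tiny ST0, the addition 1.0 + ST0 rounds away ST0's low-order bits before the logarithm ever sees them, while log1p does not. A minimal standalone C sketch of that precision argument (ordinary libm calls, not part of the patch; the M_LN2 fallback is only there because strict ISO C does not guarantee the macro):

#include <math.h>
#include <stdio.h>

#ifndef M_LN2
#define M_LN2 0.69314718055994530942    /* ln(2) */
#endif

int main(void)
{
    double x = 1e-17;                   /* below DBL_EPSILON, so 1.0 + x == 1.0 exactly */
    double naive  = log2(1.0 + x);      /* the +1 swallows x entirely: prints 0 */
    double stable = log1p(x) / M_LN2;   /* keeps x's bits: ~1.4426950409e-17 */
    printf("log2(1+x)    = %.17g\n", naive);
    printf("log1p(x)/ln2 = %.17g\n", stable);
    return 0;
}

Built with cc demo.c -lm, the naive form prints 0 while the log1p form prints the correctly rounded result, which is exactly the failure mode the retained comment describes.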