Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add back our older VFPU approximations, as fallbacks if the table files are missing #17228

Merged
merged 4 commits into from
Apr 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2067,6 +2067,8 @@ add_library(${CoreLibName} ${CoreLinkType}
Core/MIPS/MIPSTables.h
Core/MIPS/MIPSVFPUUtils.cpp
Core/MIPS/MIPSVFPUUtils.h
Core/MIPS/MIPSVFPUFallbacks.cpp
Core/MIPS/MIPSVFPUFallbacks.h
Core/MIPS/MIPSAsm.cpp
Core/MIPS/MIPSAsm.h
Core/MemFault.cpp
Expand Down
2 changes: 2 additions & 0 deletions Core/Core.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,7 @@
<ClCompile Include="MIPS\IR\IRJit.cpp" />
<ClCompile Include="MIPS\IR\IRPassSimplify.cpp" />
<ClCompile Include="MIPS\IR\IRRegCache.cpp" />
<ClCompile Include="MIPS\MIPSVFPUFallbacks.cpp" />
<ClCompile Include="Replay.cpp" />
<ClCompile Include="Compatibility.cpp" />
<ClCompile Include="Config.cpp" />
Expand Down Expand Up @@ -1154,6 +1155,7 @@
<ClInclude Include="MIPS\IR\IRJit.h" />
<ClInclude Include="MIPS\IR\IRPassSimplify.h" />
<ClInclude Include="MIPS\IR\IRRegCache.h" />
<ClInclude Include="MIPS\MIPSVFPUFallbacks.h" />
<ClInclude Include="Replay.h" />
<ClInclude Include="Compatibility.h" />
<ClInclude Include="Config.h" />
Expand Down
6 changes: 6 additions & 0 deletions Core/Core.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -1192,6 +1192,9 @@
<ClCompile Include="TiltEventProcessor.cpp">
<Filter>Core</Filter>
</ClCompile>
<ClCompile Include="MIPS\MIPSVFPUFallbacks.cpp">
<Filter>MIPS</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="ELF\ElfReader.h">
Expand Down Expand Up @@ -1926,6 +1929,9 @@
<ClInclude Include="TiltEventProcessor.h">
<Filter>Core</Filter>
</ClInclude>
<ClInclude Include="MIPS\MIPSVFPUFallbacks.h">
<Filter>MIPS</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="..\LICENSE.TXT" />
Expand Down
312 changes: 312 additions & 0 deletions Core/MIPS/MIPSVFPUFallbacks.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,312 @@
#include <cmath>

#include "Common/BitScan.h"

#include "Core/MIPS/MIPSVFPUFallbacks.h"
#include "Core/MIPS/MIPSVFPUUtils.h"

// MIPSVFPUUtils now has the high precision instructions implemented by fp64
// in https://github.com/hrydgard/ppsspp/pull/16984 .
//
// These are our older approximations that are quite good but has flaws,
// but we need them to fall back to if the table files are missing.
//
// Note that currently, some of the new functions are not integrated in the JIT
// and are thus not normally used anyway.

// First the "trivial" fallbacks where we haven't done any accuracy work previously.

float vfpu_asin_fallback(float angle) {
return (float)(asinf(angle) / M_PI_2);
}

float vfpu_rcp_fallback(float x) {
return 1.0f / x;
}

float vfpu_log2_fallback(float x) {
return log2f(x);
}

float vfpu_exp2_fallback(float x) {
return exp2f(x);
}

// Flushes the angle to 0 if exponent smaller than this in vfpu_sin/vfpu_cos/vfpu_sincos.
// Was measured to be around 0x68, but GTA on Mac is somehow super sensitive
// to the shape of the sine curve which seem to be very slightly different.
//
// So setting a lower value.
#define PRECISION_EXP_THRESHOLD 0x65

union float2int {
uint32_t i;
float f;
};

float vfpu_sqrt_fallback(float a) {
float2int val;
val.f = a;

if ((val.i & 0xff800000) == 0x7f800000) {
if ((val.i & 0x007fffff) != 0) {
val.i = 0x7f800001;
}
return val.f;
}
if ((val.i & 0x7f800000) == 0) {
// Kill any sign.
val.i = 0;
return val.f;
}
if (val.i & 0x80000000) {
val.i = 0x7f800001;
return val.f;
}

int k = get_exp(val.i);
uint32_t sp = get_mant(val.i);
int less_bits = k & 1;
k >>= 1;

uint32_t z = 0x00C00000 >> less_bits;
int64_t halfsp = sp >> 1;
halfsp <<= 23 - less_bits;
for (int i = 0; i < 6; ++i) {
z = (z >> 1) + (uint32_t)(halfsp / z);
}

val.i = ((k + 127) << 23) | ((z << less_bits) & 0x007FFFFF);
// The lower two bits never end up set on the PSP, it seems like.
val.i &= 0xFFFFFFFC;

return val.f;
}

static inline uint32_t mant_mul(uint32_t a, uint32_t b) {
uint64_t m = (uint64_t)a * (uint64_t)b;
if (m & 0x007FFFFF) {
m += 0x01437000;
}
return (uint32_t)(m >> 23);
}

float vfpu_rsqrt_fallback(float a) {
float2int val;
val.f = a;

if (val.i == 0x7f800000) {
return 0.0f;
}
if ((val.i & 0x7fffffff) > 0x7f800000) {
val.i = (val.i & 0x80000000) | 0x7f800001;
return val.f;
}
if ((val.i & 0x7f800000) == 0) {
val.i = (val.i & 0x80000000) | 0x7f800000;
return val.f;
}
if (val.i & 0x80000000) {
val.i = 0xff800001;
return val.f;
}

int k = get_exp(val.i);
uint32_t sp = get_mant(val.i);
int less_bits = k & 1;
k = -(k >> 1);

uint32_t z = 0x00800000 >> less_bits;
uint32_t halfsp = sp >> (1 + less_bits);
for (int i = 0; i < 6; ++i) {
uint32_t zsq = mant_mul(z, z);
uint32_t correction = 0x00C00000 - mant_mul(halfsp, zsq);
z = mant_mul(z, correction);
}

int8_t shift = (int8_t)clz32_nonzero(z) - 8 + less_bits;
if (shift < 1) {
z >>= -shift;
k += -shift;
} else if (shift > 0) {
z <<= shift;
k -= shift;
}

z >>= less_bits;

val.i = ((k + 127) << 23) | (z & 0x007FFFFF);
val.i &= 0xFFFFFFFC;

return val.f;
}

float vfpu_sin_fallback(float a) {
float2int val;
val.f = a;

int32_t k = get_uexp(val.i);
if (k == 255) {
val.i = (val.i & 0xFF800001) | 1;
return val.f;
}

if (k < PRECISION_EXP_THRESHOLD) {
val.i &= 0x80000000;
return val.f;
}

// Okay, now modulus by 4 to begin with (identical wave every 4.)
int32_t mantissa = get_mant(val.i);
if (k > 0x80) {
const uint8_t over = k & 0x1F;
mantissa = (mantissa << over) & 0x00FFFFFF;
k = 0x80;
}
// This subtracts off the 2. If we do, flip sign to inverse the wave.
if (k == 0x80 && mantissa >= (1 << 23)) {
val.i ^= 0x80000000;
mantissa -= 1 << 23;
}

int8_t norm_shift = mantissa == 0 ? 32 : (int8_t)clz32_nonzero(mantissa) - 8;
mantissa <<= norm_shift;
k -= norm_shift;

if (k <= 0 || mantissa == 0) {
val.i &= 0x80000000;
return val.f;
}

// This is the value with modulus applied.
val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23));
val.f = (float)sin((double)val.f * M_PI_2);
val.i &= 0xFFFFFFFC;
return val.f;
}

float vfpu_cos_fallback(float a) {
float2int val;
val.f = a;
bool negate = false;

int32_t k = get_uexp(val.i);
if (k == 255) {
// Note: unlike sin, cos always returns +NAN.
val.i = (val.i & 0x7F800001) | 1;
return val.f;
}

if (k < PRECISION_EXP_THRESHOLD)
return 1.0f;

// Okay, now modulus by 4 to begin with (identical wave every 4.)
int32_t mantissa = get_mant(val.i);
if (k > 0x80) {
const uint8_t over = k & 0x1F;
mantissa = (mantissa << over) & 0x00FFFFFF;
k = 0x80;
}
// This subtracts off the 2. If we do, negate the result value.
if (k == 0x80 && mantissa >= (1 << 23)) {
mantissa -= 1 << 23;
negate = true;
}

int8_t norm_shift = mantissa == 0 ? 32 : (int8_t)clz32_nonzero(mantissa) - 8;
mantissa <<= norm_shift;
k -= norm_shift;

if (k <= 0 || mantissa == 0)
return negate ? -1.0f : 1.0f;

// This is the value with modulus applied.
val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23));
if (val.f == 1.0f || val.f == -1.0f) {
return negate ? 0.0f : -0.0f;
}
val.f = (float)cos((double)val.f * M_PI_2);
val.i &= 0xFFFFFFFC;
return negate ? -val.f : val.f;
}

void vfpu_sincos_fallback(float a, float &s, float &c) {
float2int val;
val.f = a;
// For sin, negate the input, for cos negate the output.
bool negate = false;

int32_t k = get_uexp(val.i);
if (k == 255) {
val.i = (val.i & 0xFF800001) | 1;
s = val.f;
val.i &= 0x7F800001;
c = val.f;
return;
}

if (k < PRECISION_EXP_THRESHOLD) {
val.i &= 0x80000000;
s = val.f;
c = 1.0f;
return;
}

// Okay, now modulus by 4 to begin with (identical wave every 4.)
int32_t mantissa = get_mant(val.i);
if (k > 0x80) {
const uint8_t over = k & 0x1F;
mantissa = (mantissa << over) & 0x00FFFFFF;
k = 0x80;
}
// This subtracts off the 2. If we do, flip signs.
if (k == 0x80 && mantissa >= (1 << 23)) {
mantissa -= 1 << 23;
negate = true;
}

int8_t norm_shift = mantissa == 0 ? 32 : (int8_t)clz32_nonzero(mantissa) - 8;
mantissa <<= norm_shift;
k -= norm_shift;

if (k <= 0 || mantissa == 0) {
val.i &= 0x80000000;
if (negate)
val.i ^= 0x80000000;
s = val.f;
c = negate ? -1.0f : 1.0f;
return;
}

// This is the value with modulus applied.
val.i = (val.i & 0x80000000) | (k << 23) | (mantissa & ~(1 << 23));
float2int i_sine, i_cosine;
if (val.f == 1.0f) {
i_sine.f = negate ? -1.0f : 1.0f;
i_cosine.f = negate ? 0.0f : -0.0f;
} else if (val.f == -1.0f) {
i_sine.f = negate ? 1.0f : -1.0f;
i_cosine.f = negate ? 0.0f : -0.0f;
} else if (negate) {
i_sine.f = (float)sin((double)-val.f * M_PI_2);
i_cosine.f = -(float)cos((double)val.f * M_PI_2);
} else {
double angle = (double)val.f * M_PI_2;
#if defined(__linux__)
double d_sine;
double d_cosine;
sincos(angle, &d_sine, &d_cosine);
i_sine.f = (float)d_sine;
i_cosine.f = (float)d_cosine;
#else
i_sine.f = (float)sin(angle);
i_cosine.f = (float)cos(angle);
#endif
}

i_sine.i &= 0xFFFFFFFC;
i_cosine.i &= 0xFFFFFFFC;
s = i_sine.f;
c = i_cosine.f;
return;
}
14 changes: 14 additions & 0 deletions Core/MIPS/MIPSVFPUFallbacks.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#pragma once

// These are our old implementation of VFPU math functions, that don't make use of the
// accuracy-improving tables from #16984.

float vfpu_asin_fallback(float angle);
float vfpu_sqrt_fallback(float a);
float vfpu_rsqrt_fallback(float a);
float vfpu_sin_fallback(float a);
float vfpu_cos_fallback(float a);
void vfpu_sincos_fallback(float a, float &s, float &c);
float vfpu_rcp_fallback(float x);
float vfpu_log2_fallback(float x);
float vfpu_exp2_fallback(float x);
Loading