Skip to content

Commit

Permalink
Minor improvements to floating point conversion functions
Browse files Browse the repository at this point in the history
  • Loading branch information
fo76utils committed Nov 29, 2024
1 parent 8848b00 commit 541648a
Show file tree
Hide file tree
Showing 9 changed files with 107 additions and 78 deletions.
1 change: 0 additions & 1 deletion lib/libfo76utils/src/bsmatcdb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -659,7 +659,6 @@ void BSMaterialsCDB::loadItem(
static_cast< CDBObject_Float * >(o)->value = buf2.readFloat();
break;
case BSReflStream::String_Double:
// FIXME: implement this in a portable way
static_cast< CDBObject_Double * >(o)->value =
std::bit_cast< double, std::uint64_t >(buf2.readUInt64());
break;
Expand Down
15 changes: 1 addition & 14 deletions lib/libfo76utils/src/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ NifSkopeError::~NifSkopeError() noexcept
#if ENABLE_X86_64_SIMD < 3
std::uint16_t convertToFloat16(float x)
{
# if defined(__i386__) || defined(__x86_64__) || defined(__x86_64)
std::uint32_t n = std::bit_cast< std::uint32_t >(x);
std::uint32_t s = (n & 0x80000000U) >> 16;
n = n & 0x7FFFFFFFU;
Expand All @@ -66,21 +65,9 @@ std::uint16_t convertToFloat16(float x)
x = std::bit_cast< float >(n);
return std::uint16_t(std::uint32_t(roundFloat(x * float(1 << 24))) | s);
}
n = (n - 0x37FFF000U) >> 13;
n = (n + ((n & 0x00002000U) >> 13) - 0x37FFF001U) >> 13;
n = (n < 0x7FFFU ? n : 0x7FFFU);
return std::uint16_t(n | s);
# else
int e = 0;
int m = roundFloat(float(std::frexp(x, &e)) * 2048.0f);
if (!m)
return 0;
int s = m & 0x8000;
m = std::abs(m);
e = e + 14 + (m >> 11);
if (e <= 0 || e > 31)
return std::uint16_t(s | (e <= 0 ? 0x0000 : 0x7FFF));
return std::uint16_t(s | (e << 10) | (m & 0x03FF));
# endif
}
#endif

Expand Down
10 changes: 2 additions & 8 deletions lib/libfo76utils/src/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,24 +131,18 @@ inline float convertFloat16(unsigned short n)
std::int16_t(n), 0, 0, 0, 0, 0, 0, 0
};
return __builtin_ia32_vcvtph2ps(tmp)[0];
#elif defined(__i386__) || defined(__x86_64__) || defined(__x86_64)
#else
std::uint32_t m = (std::uint32_t) int((std::int16_t) n);
std::uint32_t i = ((m << 13) & 0x8FFFE000U) + 0x38000000U;
float r = std::bit_cast< float >(i);
if (!(m & 0x7C00U)) [[unlikely]]
{
// zero or denormal
i = std::bit_cast< std::uint32_t >(r) & 0xFF800000U;
i = i & 0xFF800000U;
r = r - std::bit_cast< float >(i);
r = r + r;
}
return r;
#else
unsigned char e = (unsigned char) ((n >> 10) & 0x1F);
if (!e)
return 0.0f;
long long m = (long long) ((n & 0x03FF) | 0x0400) << e;
return (float(!(n & 0x8000) ? m : -m) * (1.0f / 33554432.0f));
#endif
}

Expand Down
11 changes: 0 additions & 11 deletions lib/libfo76utils/src/filebuf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,20 +78,9 @@ float FileBuffer::readFloat()
if ((filePos + 4) > fileBufSize)
errorMessage("end of input file");
std::uint32_t tmp = readUInt32Fast();
#if defined(__i386__) || defined(__x86_64__) || defined(__x86_64)
if (!((tmp + 0x00800000U) & 0x7F000000U))
return 0.0f;
return std::bit_cast< float, std::uint32_t >(tmp);
#else
int e = int((tmp >> 23) & 0xFF);
if (e == 0x00 || e == 0xFF)
return 0.0f;
double m = double(int((tmp & 0x007FFFFF) | 0x00800000));
m = std::ldexp(m, e - 150);
if (tmp & 0x80000000U)
m = -m;
return float(m);
#endif
}

FloatVector4 FileBuffer::readFloatVector4()
Expand Down
2 changes: 2 additions & 0 deletions lib/libfo76utils/src/fp32vec4.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ struct FloatVector4
const std::uint32_t *p1_3, const std::uint32_t *p2_3,
float xf, float yf, bool isSRGB = false);
static inline FloatVector4 convertInt16(const std::uint64_t& n);
static inline FloatVector4 convertInt32(const std::int32_t *p);
inline void convertToInt32(std::int32_t *p);
// if noInfNaN is true, Inf and NaN values are never returned
static inline FloatVector4 convertFloat16(std::uint64_t n,
bool noInfNaN = false);
Expand Down
13 changes: 13 additions & 0 deletions lib/libfo76utils/src/fp32vec4_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,19 @@ inline FloatVector4 FloatVector4::convertInt16(const std::uint64_t& n)
float(std::int16_t((n >> 48) & 0xFFFFU)));
}

// Portable version: builds a vector from the four consecutive 32-bit signed
// integers at p, converting each element to float separately.
inline FloatVector4 FloatVector4::convertInt32(const std::int32_t *p)
{
  const float x = float(p[0]);
  const float y = float(p[1]);
  const float z = float(p[2]);
  const float w = float(p[3]);
  return FloatVector4(x, y, z, w);
}

// Portable version: writes the four elements of the vector to p[0..3] as
// 32-bit signed integers, passing each one through roundFloat() first.
inline void FloatVector4::convertToInt32(std::int32_t *p)
{
  for (int i = 0; i < 4; i++)
    p[i] = std::int32_t(roundFloat(v[i]));
}

inline FloatVector4 FloatVector4::convertFloat16(std::uint64_t n, bool noInfNaN)
{
(void) noInfNaN;
Expand Down
59 changes: 41 additions & 18 deletions lib/libfo76utils/src/fp32vec4_clang.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,24 @@ inline FloatVector4 FloatVector4::convertInt16(const std::uint64_t& n)
return FloatVector4(v);
}

// Clang version: gathers p[0..3] into a 4 x int32 vector and converts all
// four lanes to float with a single vcvtdq2ps instruction.
inline FloatVector4 FloatVector4::convertInt32(const std::int32_t *p)
{
  XMM_Int32 ints = { p[0], p[1], p[2], p[3] };
  XMM_Float floats;
  // both operands are constrained to SSE/AVX registers ("x")
  __asm__ ("vcvtdq2ps %1, %0" : "=x" (floats) : "x" (ints));
  return FloatVector4(floats);
}

inline void FloatVector4::convertToInt32(std::int32_t *p)
{
XMM_Int32 tmp;
__asm__ ("vcvtps2dq %1, %0" : "=x" (tmp) : "xm" (v));
p[0] = tmp[0];
p[1] = tmp[1];
p[2] = tmp[2];
p[3] = tmp[3];
}

inline FloatVector4 FloatVector4::convertFloat16(
std::uint64_t n, [[maybe_unused]] bool noInfNaN)
{
Expand Down Expand Up @@ -87,33 +105,38 @@ inline FloatVector4 FloatVector4::convertFloat16(

// Converts the vector to four half-precision floats packed into a 64-bit
// integer (element 0 in the lowest 16 bits, element 3 in the highest).
// Bit N of 'mask' enables element N; disabled elements are output as zero.
// NOTE(review): this listing appears to interleave the removed and the added
// lines of a diff hunk without +/- markers (note the duplicated "#if" and
// "return" lines), so it is not compilable as shown - confirm against the
// actual repository file before relying on it.
inline std::uint64_t FloatVector4::convertToFloat16(unsigned int mask) const
{
#if ENABLE_X86_64_SIMD >= 3
XMM_UInt64 tmp;
XMM_Float x = v;
if (mask < 15U)
{
// mask * 0x10204080 puts mask bit N into the top bit of byte N; sign
// extending the bytes to dwords and shifting right arithmetically by 31
// expands each bit into an all-ones or all-zeros 32-bit lane mask
__asm__ ("vmovd %1, %0" : "=x" (tmp) : "r" (mask * 0x10204080U));
__asm__ ("vpmovsxbd %0, %0" : "+x" (tmp));
__asm__ ("vpsrad $0x1f, %0, %0" : "+x" (tmp));
__asm__ ("vpand %1, %0, %0" : "+x" (tmp) : "xm" (v));
__asm__ ("vcvtps2ph $0x00, %0, %0" : "+x" (tmp));
}
else
{
__asm__ ("vcvtps2ph $0x00, %1, %0" : "=x" (tmp) : "x" (v));
// clear the lanes of x that are disabled by the mask
__asm__ ("vpand %1, %0, %0" : "+x" (x) : "x" (tmp));
}
return tmp[0];
#if ENABLE_X86_64_SIMD >= 3
// hardware float -> half conversion (F16C instruction)
__asm__ ("vcvtps2ph $0x00, %1, %0" : "=x" (tmp) : "x" (x));
#else
// scalar path: convert only the enabled elements and pack them 16 bits apart
std::uint64_t r = 0U;
if (mask & 1U)
r = ::convertToFloat16(v[0]);
if (mask & 2U)
r = r | (std::uint64_t(::convertToFloat16(v[1])) << 16);
if (mask & 4U)
r = r | (std::uint64_t(::convertToFloat16(v[2])) << 32);
if (mask & 8U)
r = r | (std::uint64_t(::convertToFloat16(v[3])) << 48);
return r;
// vector path without F16C, operating on the raw bits of x:
// s = sign bits, n = bit pattern of the absolute value
XMM_UInt32 n = std::bit_cast< XMM_UInt32 >(x);
XMM_UInt32 s = { 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U };
s = s & n;
n = n ^ s;
// 0x4B800000 is the bit pattern of 2^24 as a float: d = |x| * 2^24 converted
// to integer and limited to 11 bits (presumably the result used for values
// in the half-precision denormal range - TODO confirm)
XMM_UInt32 d = { 0x4B800000U, 0x4B800000U, 0x4B800000U, 0x4B800000U };
XMM_UInt32 m = { 0x000007FFU, 0x000007FFU, 0x000007FFU, 0x000007FFU };
__asm__ ("vmulps %1, %0, %0" : "+x" (d) : "x" (n));
__asm__ ("vcvtps2dq %0, %0" : "+x" (d));
d = d & m;
// 0x37FFF001 = 0x38000000 - 0x0FFF: subtracting it and adding bit 13 of n
// before the shift by 13 changes the exponent bias and rounds the mantissa
XMM_UInt32 offs = { 0x37FFF001U, 0x37FFF001U, 0x37FFF001U, 0x37FFF001U };
XMM_UInt32 r = { 0x00002000U, 0x00002000U, 0x00002000U, 0x00002000U };
r = (r & n) >> 13;
n = n + r - offs;
__asm__ ("vpsrad $0x0d, %0, %0" : "+x" (n));
// signed saturating pack: 0x80000000 becomes 0x8000 in each 16-bit lane
__asm__ ("vpackssdw %0, %0, %0" : "+x" (s));
// take the larger (signed) of the shifted value and d in each lane
__asm__ ("vpmaxsd %1, %0, %0" : "+x" (n) : "x" (d));
// signed saturating pack limits the magnitude to 0x7FFF
__asm__ ("vpackssdw %0, %0, %0" : "+x" (n));
// combine magnitude and sign
__asm__ ("vpor %2, %1, %0" : "=x" (tmp) : "x" (n), "x" (s));
#endif
return tmp[0];
}

inline void FloatVector4::convertToFloats(float *p) const
Expand Down
56 changes: 38 additions & 18 deletions lib/libfo76utils/src/fp32vec4_gcc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,21 @@ inline FloatVector4 FloatVector4::convertInt16(const std::uint64_t& n)
return FloatVector4(__builtin_ia32_cvtdq2ps(v));
}

// GCC version: gathers p[0..3] into a 4 x int32 vector and converts all four
// lanes to float with the cvtdq2ps builtin.
inline FloatVector4 FloatVector4::convertInt32(const std::int32_t *p)
{
  XMM_Int32 ints = { p[0], p[1], p[2], p[3] };
  return FloatVector4(__builtin_ia32_cvtdq2ps(ints));
}

inline void FloatVector4::convertToInt32(std::int32_t *p)
{
XMM_Int32 tmp = __builtin_ia32_cvtps2dq(v);
p[0] = tmp[0];
p[1] = tmp[1];
p[2] = tmp[2];
p[3] = tmp[3];
}

inline FloatVector4 FloatVector4::convertFloat16(
std::uint64_t n, [[maybe_unused]] bool noInfNaN)
{
Expand Down Expand Up @@ -86,33 +101,38 @@ inline FloatVector4 FloatVector4::convertFloat16(

// Converts the vector to four half-precision floats packed into a 64-bit
// integer (element 0 in the lowest 16 bits, element 3 in the highest).
// Bit N of 'mask' enables element N; disabled elements are output as zero.
// NOTE(review): this listing appears to interleave the removed and the added
// lines of a diff hunk without +/- markers (note the duplicated "#if" and
// "return" lines), so it is not compilable as shown - confirm against the
// actual repository file before relying on it.
inline std::uint64_t FloatVector4::convertToFloat16(unsigned int mask) const
{
#if ENABLE_X86_64_SIMD >= 3
XMM_UInt64 tmp;
XMM_Float x = v;
if (mask < 15U)
{
// mask * 0x10204080 puts mask bit N into the top bit of byte N; sign
// extending the bytes to dwords and shifting right arithmetically by 31
// expands each bit into an all-ones or all-zeros 32-bit lane mask
__asm__ ("vmovd %1, %0" : "=x" (tmp) : "r" (mask * 0x10204080U));
__asm__ ("vpmovsxbd %0, %0" : "+x" (tmp));
__asm__ ("vpsrad $0x1f, %0, %0" : "+x" (tmp));
__asm__ ("vpand %1, %0, %0" : "+x" (tmp) : "xm" (v));
__asm__ ("vcvtps2ph $0x00, %0, %0" : "+x" (tmp));
}
else
{
__asm__ ("vcvtps2ph $0x00, %1, %0" : "=x" (tmp) : "x" (v));
// clear the lanes of x that are disabled by the mask
__asm__ ("vpand %1, %0, %0" : "+x" (x) : "x" (tmp));
}
return tmp[0];
#if ENABLE_X86_64_SIMD >= 3
// hardware float -> half conversion (F16C instruction)
__asm__ ("vcvtps2ph $0x00, %1, %0" : "=x" (tmp) : "x" (x));
#else
// scalar path: convert only the enabled elements and pack them 16 bits apart
std::uint64_t r = 0U;
if (mask & 1U)
r = ::convertToFloat16(v[0]);
if (mask & 2U)
r = r | (std::uint64_t(::convertToFloat16(v[1])) << 16);
if (mask & 4U)
r = r | (std::uint64_t(::convertToFloat16(v[2])) << 32);
if (mask & 8U)
r = r | (std::uint64_t(::convertToFloat16(v[3])) << 48);
return r;
// vector path without F16C, operating on the raw bits of x:
// s = sign bits, n = bit pattern of the absolute value
XMM_UInt32 n = std::bit_cast< XMM_UInt32 >(x);
XMM_UInt32 s = { 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U };
s = s & n;
n = n ^ s;
// 0x4B800000 is the bit pattern of 2^24 as a float: d = |x| * 2^24 converted
// to integer and limited to 11 bits (presumably the result used for values
// in the half-precision denormal range - TODO confirm)
XMM_UInt32 d = { 0x4B800000U, 0x4B800000U, 0x4B800000U, 0x4B800000U };
XMM_UInt32 m = { 0x000007FFU, 0x000007FFU, 0x000007FFU, 0x000007FFU };
__asm__ ("vmulps %1, %0, %0" : "+x" (d) : "x" (n));
__asm__ ("vcvtps2dq %0, %0" : "+x" (d));
d = d & m;
// 0x37FFF001 = 0x38000000 - 0x0FFF: subtracting it and adding bit 13 of n
// before the shift by 13 changes the exponent bias and rounds the mantissa
XMM_UInt32 offs = { 0x37FFF001U, 0x37FFF001U, 0x37FFF001U, 0x37FFF001U };
XMM_UInt32 r = { 0x00002000U, 0x00002000U, 0x00002000U, 0x00002000U };
r = (r & n) >> 13;
n = n + r - offs;
__asm__ ("vpsrad $0x0d, %0, %0" : "+x" (n));
// signed saturating pack: 0x80000000 becomes 0x8000 in each 16-bit lane
__asm__ ("vpackssdw %0, %0, %0" : "+x" (s));
// take the larger (signed) of the shifted value and d in each lane
__asm__ ("vpmaxsd %1, %0, %0" : "+x" (n) : "x" (d));
// signed saturating pack limits the magnitude to 0x7FFF
__asm__ ("vpackssdw %0, %0, %0" : "+x" (n));
// combine magnitude and sign
__asm__ ("vpor %2, %1, %0" : "=x" (tmp) : "x" (n), "x" (s));
#endif
return tmp[0];
}

inline void FloatVector4::convertToFloats(float *p) const
Expand Down
18 changes: 10 additions & 8 deletions lib/libfo76utils/src/sfcube2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -519,8 +519,8 @@ void SFCubeMapCache::convertHDRToDDSThread(
{
int n = yStart / cubeWidth;
int y = yStart % cubeWidth;
FloatVector4 xi_v(0.0f);
FloatVector4 yi_v(0.0f);
std::int32_t xi_v[4];
std::int32_t yi_v[4];
FloatVector4 xf_v(0.0f);
FloatVector4 yf_v(0.0f);
for (int x = 0; x < cubeWidth; x++, p = p + outPixelSize)
Expand All @@ -546,13 +546,15 @@ void SFCubeMapCache::convertHDRToDDSThread(
FloatVector4 xf = atan2NormFast(tmpX, tmpY) * 0.5f + 0.5f;
xf = xf * float(w) - 0.5f;
yf = yf * float(h) - 0.5f;
xi_v = FloatVector4(xf).floorValues();
yi_v = FloatVector4(yf).floorValues();
xf_v = xf - xi_v;
yf_v = yf - yi_v;
FloatVector4 xi = FloatVector4(xf).floorValues();
FloatVector4 yi = FloatVector4(yf).floorValues();
xi.convertToInt32(xi_v);
yi.convertToInt32(yi_v);
xf_v = xf - xi;
yf_v = yf - yi;
}
int x0 = int(xi_v[x & 3]);
int y0 = int(yi_v[x & 3]);
int x0 = xi_v[x & 3];
int y0 = yi_v[x & 3];
float xf = xf_v[x & 3];
float yf = yf_v[x & 3];
x0 = (x0 <= (w - 1) ? (x0 >= 0 ? x0 : (w - 1)) : 0);
Expand Down

0 comments on commit 541648a

Please sign in to comment.