diff --git a/lib/libfo76utils/src/bsmatcdb.cpp b/lib/libfo76utils/src/bsmatcdb.cpp index e636ae2a..7b4e104f 100644 --- a/lib/libfo76utils/src/bsmatcdb.cpp +++ b/lib/libfo76utils/src/bsmatcdb.cpp @@ -659,7 +659,6 @@ void BSMaterialsCDB::loadItem( static_cast< CDBObject_Float * >(o)->value = buf2.readFloat(); break; case BSReflStream::String_Double: - // FIXME: implement this in a portable way static_cast< CDBObject_Double * >(o)->value = std::bit_cast< double, std::uint64_t >(buf2.readUInt64()); break; diff --git a/lib/libfo76utils/src/common.cpp b/lib/libfo76utils/src/common.cpp index fd8eb161..29e7f6f9 100644 --- a/lib/libfo76utils/src/common.cpp +++ b/lib/libfo76utils/src/common.cpp @@ -57,7 +57,6 @@ NifSkopeError::~NifSkopeError() noexcept #if ENABLE_X86_64_SIMD < 3 std::uint16_t convertToFloat16(float x) { -# if defined(__i386__) || defined(__x86_64__) || defined(__x86_64) std::uint32_t n = std::bit_cast< std::uint32_t >(x); std::uint32_t s = (n & 0x80000000U) >> 16; n = n & 0x7FFFFFFFU; @@ -66,21 +65,9 @@ std::uint16_t convertToFloat16(float x) x = std::bit_cast< float >(n); return std::uint16_t(std::uint32_t(roundFloat(x * float(1 << 24))) | s); } - n = (n - 0x37FFF000U) >> 13; + n = (n + ((n & 0x00002000U) >> 13) - 0x37FFF001U) >> 13; n = (n < 0x7FFFU ? n : 0x7FFFU); return std::uint16_t(n | s); -# else - int e = 0; - int m = roundFloat(float(std::frexp(x, &e)) * 2048.0f); - if (!m) - return 0; - int s = m & 0x8000; - m = std::abs(m); - e = e + 14 + (m >> 11); - if (e <= 0 || e > 31) - return std::uint16_t(s | (e <= 0 ? 0x0000 : 0x7FFF)); - return std::uint16_t(s | (e << 10) | (m & 0x03FF)); -# endif } #endif diff --git a/lib/libfo76utils/src/common.hpp b/lib/libfo76utils/src/common.hpp index 571ee123..b686085e 100644 --- a/lib/libfo76utils/src/common.hpp +++ b/lib/libfo76utils/src/common.hpp @@ -131,24 +131,18 @@ inline float convertFloat16(unsigned short n) std::int16_t(n), 0, 0, 0, 0, 0, 0, 0 }; return __builtin_ia32_vcvtph2ps(tmp)[0]; -#elif defined(__i386__) || defined(__x86_64__) || defined(__x86_64) +#else std::uint32_t m = (std::uint32_t) int((std::int16_t) n); std::uint32_t i = ((m << 13) & 0x8FFFE000U) + 0x38000000U; float r = std::bit_cast< float >(i); if (!(m & 0x7C00U)) [[unlikely]] { // zero or denormal - i = std::bit_cast< std::uint32_t >(r) & 0xFF800000U; + i = i & 0xFF800000U; r = r - std::bit_cast< float >(i); r = r + r; } return r; -#else - unsigned char e = (unsigned char) ((n >> 10) & 0x1F); - if (!e) - return 0.0f; - long long m = (long long) ((n & 0x03FF) | 0x0400) << e; - return (float(!(n & 0x8000) ? m : -m) * (1.0f / 33554432.0f)); #endif } diff --git a/lib/libfo76utils/src/filebuf.cpp b/lib/libfo76utils/src/filebuf.cpp index 665f3433..ec5ecad7 100644 --- a/lib/libfo76utils/src/filebuf.cpp +++ b/lib/libfo76utils/src/filebuf.cpp @@ -78,20 +78,9 @@ float FileBuffer::readFloat() if ((filePos + 4) > fileBufSize) errorMessage("end of input file"); std::uint32_t tmp = readUInt32Fast(); -#if defined(__i386__) || defined(__x86_64__) || defined(__x86_64) if (!((tmp + 0x00800000U) & 0x7F000000U)) return 0.0f; return std::bit_cast< float, std::uint32_t >(tmp); -#else - int e = int((tmp >> 23) & 0xFF); - if (e == 0x00 || e == 0xFF) - return 0.0f; - double m = double(int((tmp & 0x007FFFFF) | 0x00800000)); - m = std::ldexp(m, e - 150); - if (tmp & 0x80000000U) - m = -m; - return float(m); -#endif } FloatVector4 FileBuffer::readFloatVector4() diff --git a/lib/libfo76utils/src/fp32vec4.hpp b/lib/libfo76utils/src/fp32vec4.hpp index d30546d2..8285d761 100644 --- a/lib/libfo76utils/src/fp32vec4.hpp +++ b/lib/libfo76utils/src/fp32vec4.hpp @@ -76,6 +76,8 @@ struct FloatVector4 const std::uint32_t *p1_3, const std::uint32_t *p2_3, float xf, float yf, bool isSRGB = false); static inline FloatVector4 convertInt16(const std::uint64_t& n); + static inline FloatVector4 convertInt32(const std::int32_t *p); + inline void convertToInt32(std::int32_t *p); // if noInfNaN is true, Inf and NaN values are never returned static inline FloatVector4 convertFloat16(std::uint64_t n, bool noInfNaN = false); diff --git a/lib/libfo76utils/src/fp32vec4_base.hpp b/lib/libfo76utils/src/fp32vec4_base.hpp index 4b86b0d5..8958a536 100644 --- a/lib/libfo76utils/src/fp32vec4_base.hpp +++ b/lib/libfo76utils/src/fp32vec4_base.hpp @@ -64,6 +64,19 @@ inline FloatVector4 FloatVector4::convertInt16(const std::uint64_t& n) float(std::int16_t((n >> 48) & 0xFFFFU))); } +inline FloatVector4 FloatVector4::convertInt32(const std::int32_t *p) +{ + return FloatVector4(float(p[0]), float(p[1]), float(p[2]), float(p[3])); +} + +inline void FloatVector4::convertToInt32(std::int32_t *p) +{ + p[0] = std::int32_t(roundFloat(v[0])); + p[1] = std::int32_t(roundFloat(v[1])); + p[2] = std::int32_t(roundFloat(v[2])); + p[3] = std::int32_t(roundFloat(v[3])); +} + inline FloatVector4 FloatVector4::convertFloat16(std::uint64_t n, bool noInfNaN) { (void) noInfNaN; diff --git a/lib/libfo76utils/src/fp32vec4_clang.hpp b/lib/libfo76utils/src/fp32vec4_clang.hpp index 04945672..5221df1e 100644 --- a/lib/libfo76utils/src/fp32vec4_clang.hpp +++ b/lib/libfo76utils/src/fp32vec4_clang.hpp @@ -49,6 +49,24 @@ inline FloatVector4 FloatVector4::convertInt16(const std::uint64_t& n) return FloatVector4(v); } +inline FloatVector4 FloatVector4::convertInt32(const std::int32_t *p) +{ + XMM_Int32 tmp = { p[0], p[1], p[2], p[3] }; + XMM_Float v; + __asm__ ("vcvtdq2ps %1, %0" : "=x" (v) : "x" (tmp)); + return FloatVector4(v); +} + +inline void FloatVector4::convertToInt32(std::int32_t *p) +{ + XMM_Int32 tmp; + __asm__ ("vcvtps2dq %1, %0" : "=x" (tmp) : "xm" (v)); + p[0] = tmp[0]; + p[1] = tmp[1]; + p[2] = tmp[2]; + p[3] = tmp[3]; +} + inline FloatVector4 FloatVector4::convertFloat16( std::uint64_t n, [[maybe_unused]] bool noInfNaN) { @@ -87,33 +105,38 @@ inline FloatVector4 FloatVector4::convertFloat16( inline std::uint64_t FloatVector4::convertToFloat16(unsigned int mask) const { -#if ENABLE_X86_64_SIMD >= 3 XMM_UInt64 tmp; + XMM_Float x = v; if (mask < 15U) { __asm__ ("vmovd %1, %0" : "=x" (tmp) : "r" (mask * 0x10204080U)); __asm__ ("vpmovsxbd %0, %0" : "+x" (tmp)); __asm__ ("vpsrad $0x1f, %0, %0" : "+x" (tmp)); - __asm__ ("vpand %1, %0, %0" : "+x" (tmp) : "xm" (v)); - __asm__ ("vcvtps2ph $0x00, %0, %0" : "+x" (tmp)); - } - else - { - __asm__ ("vcvtps2ph $0x00, %1, %0" : "=x" (tmp) : "x" (v)); + __asm__ ("vpand %1, %0, %0" : "+x" (x) : "x" (tmp)); } - return tmp[0]; +#if ENABLE_X86_64_SIMD >= 3 + __asm__ ("vcvtps2ph $0x00, %1, %0" : "=x" (tmp) : "x" (x)); #else - std::uint64_t r = 0U; - if (mask & 1U) - r = ::convertToFloat16(v[0]); - if (mask & 2U) - r = r | (std::uint64_t(::convertToFloat16(v[1])) << 16); - if (mask & 4U) - r = r | (std::uint64_t(::convertToFloat16(v[2])) << 32); - if (mask & 8U) - r = r | (std::uint64_t(::convertToFloat16(v[3])) << 48); - return r; + XMM_UInt32 n = std::bit_cast< XMM_UInt32 >(x); + XMM_UInt32 s = { 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U }; + s = s & n; + n = n ^ s; + XMM_UInt32 d = { 0x4B800000U, 0x4B800000U, 0x4B800000U, 0x4B800000U }; + XMM_UInt32 m = { 0x000007FFU, 0x000007FFU, 0x000007FFU, 0x000007FFU }; + __asm__ ("vmulps %1, %0, %0" : "+x" (d) : "x" (n)); + __asm__ ("vcvtps2dq %0, %0" : "+x" (d)); + d = d & m; + XMM_UInt32 offs = { 0x37FFF001U, 0x37FFF001U, 0x37FFF001U, 0x37FFF001U }; + XMM_UInt32 r = { 0x00002000U, 0x00002000U, 0x00002000U, 0x00002000U }; + r = (r & n) >> 13; + n = n + r - offs; + __asm__ ("vpsrad $0x0d, %0, %0" : "+x" (n)); + __asm__ ("vpackssdw %0, %0, %0" : "+x" (s)); + __asm__ ("vpmaxsd %1, %0, %0" : "+x" (n) : "x" (d)); + __asm__ ("vpackssdw %0, %0, %0" : "+x" (n)); + __asm__ ("vpor %2, %1, %0" : "=x" (tmp) : "x" (n), "x" (s)); #endif + return tmp[0]; } inline void FloatVector4::convertToFloats(float *p) const diff --git a/lib/libfo76utils/src/fp32vec4_gcc.hpp b/lib/libfo76utils/src/fp32vec4_gcc.hpp index 4c950c79..ec89c722 100644 --- a/lib/libfo76utils/src/fp32vec4_gcc.hpp +++ b/lib/libfo76utils/src/fp32vec4_gcc.hpp @@ -49,6 +49,21 @@ inline FloatVector4 FloatVector4::convertInt16(const std::uint64_t& n) return FloatVector4(__builtin_ia32_cvtdq2ps(v)); } +inline FloatVector4 FloatVector4::convertInt32(const std::int32_t *p) +{ + XMM_Int32 v = { p[0], p[1], p[2], p[3] }; + return FloatVector4(__builtin_ia32_cvtdq2ps(v)); +} + +inline void FloatVector4::convertToInt32(std::int32_t *p) +{ + XMM_Int32 tmp = __builtin_ia32_cvtps2dq(v); + p[0] = tmp[0]; + p[1] = tmp[1]; + p[2] = tmp[2]; + p[3] = tmp[3]; +} + inline FloatVector4 FloatVector4::convertFloat16( std::uint64_t n, [[maybe_unused]] bool noInfNaN) { @@ -86,33 +101,38 @@ inline FloatVector4 FloatVector4::convertFloat16( inline std::uint64_t FloatVector4::convertToFloat16(unsigned int mask) const { -#if ENABLE_X86_64_SIMD >= 3 XMM_UInt64 tmp; + XMM_Float x = v; if (mask < 15U) { __asm__ ("vmovd %1, %0" : "=x" (tmp) : "r" (mask * 0x10204080U)); __asm__ ("vpmovsxbd %0, %0" : "+x" (tmp)); __asm__ ("vpsrad $0x1f, %0, %0" : "+x" (tmp)); - __asm__ ("vpand %1, %0, %0" : "+x" (tmp) : "xm" (v)); - __asm__ ("vcvtps2ph $0x00, %0, %0" : "+x" (tmp)); - } - else - { - __asm__ ("vcvtps2ph $0x00, %1, %0" : "=x" (tmp) : "x" (v)); + __asm__ ("vpand %1, %0, %0" : "+x" (x) : "x" (tmp)); } - return tmp[0]; +#if ENABLE_X86_64_SIMD >= 3 + __asm__ ("vcvtps2ph $0x00, %1, %0" : "=x" (tmp) : "x" (x)); #else - std::uint64_t r = 0U; - if (mask & 1U) - r = ::convertToFloat16(v[0]); - if (mask & 2U) - r = r | (std::uint64_t(::convertToFloat16(v[1])) << 16); - if (mask & 4U) - r = r | (std::uint64_t(::convertToFloat16(v[2])) << 32); - if (mask & 8U) - r = r | (std::uint64_t(::convertToFloat16(v[3])) << 48); - return r; + XMM_UInt32 n = std::bit_cast< XMM_UInt32 >(x); + XMM_UInt32 s = { 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U }; + s = s & n; + n = n ^ s; + XMM_UInt32 d = { 0x4B800000U, 0x4B800000U, 0x4B800000U, 0x4B800000U }; + XMM_UInt32 m = { 0x000007FFU, 0x000007FFU, 0x000007FFU, 0x000007FFU }; + __asm__ ("vmulps %1, %0, %0" : "+x" (d) : "x" (n)); + __asm__ ("vcvtps2dq %0, %0" : "+x" (d)); + d = d & m; + XMM_UInt32 offs = { 0x37FFF001U, 0x37FFF001U, 0x37FFF001U, 0x37FFF001U }; + XMM_UInt32 r = { 0x00002000U, 0x00002000U, 0x00002000U, 0x00002000U }; + r = (r & n) >> 13; + n = n + r - offs; + __asm__ ("vpsrad $0x0d, %0, %0" : "+x" (n)); + __asm__ ("vpackssdw %0, %0, %0" : "+x" (s)); + __asm__ ("vpmaxsd %1, %0, %0" : "+x" (n) : "x" (d)); + __asm__ ("vpackssdw %0, %0, %0" : "+x" (n)); + __asm__ ("vpor %2, %1, %0" : "=x" (tmp) : "x" (n), "x" (s)); #endif + return tmp[0]; } inline void FloatVector4::convertToFloats(float *p) const diff --git a/lib/libfo76utils/src/sfcube2.cpp b/lib/libfo76utils/src/sfcube2.cpp index 95503de0..a3e7e087 100644 --- a/lib/libfo76utils/src/sfcube2.cpp +++ b/lib/libfo76utils/src/sfcube2.cpp @@ -519,8 +519,8 @@ void SFCubeMapCache::convertHDRToDDSThread( { int n = yStart / cubeWidth; int y = yStart % cubeWidth; - FloatVector4 xi_v(0.0f); - FloatVector4 yi_v(0.0f); + std::int32_t xi_v[4]; + std::int32_t yi_v[4]; FloatVector4 xf_v(0.0f); FloatVector4 yf_v(0.0f); for (int x = 0; x < cubeWidth; x++, p = p + outPixelSize) @@ -546,13 +546,15 @@ void SFCubeMapCache::convertHDRToDDSThread( FloatVector4 xf = atan2NormFast(tmpX, tmpY) * 0.5f + 0.5f; xf = xf * float(w) - 0.5f; yf = yf * float(h) - 0.5f; - xi_v = FloatVector4(xf).floorValues(); - yi_v = FloatVector4(yf).floorValues(); - xf_v = xf - xi_v; - yf_v = yf - yi_v; + FloatVector4 xi = FloatVector4(xf).floorValues(); + FloatVector4 yi = FloatVector4(yf).floorValues(); + xi.convertToInt32(xi_v); + yi.convertToInt32(yi_v); + xf_v = xf - xi; + yf_v = yf - yi; } - int x0 = int(xi_v[x & 3]); - int y0 = int(yi_v[x & 3]); + int x0 = xi_v[x & 3]; + int y0 = yi_v[x & 3]; float xf = xf_v[x & 3]; float yf = yf_v[x & 3]; x0 = (x0 <= (w - 1) ? (x0 >= 0 ? x0 : (w - 1)) : 0);