Minor improvements to floating point conversion functions

fo76utils committed on Nov 29, 2024
Commit 541648a (1 parent: 8848b00)
Showing 9 changed files with 107 additions and 78 deletions.
diff --git a/lib/libfo76utils/src/bsmatcdb.cpp b/lib/libfo76utils/src/bsmatcdb.cpp
@@ -659,7 +659,6 @@ void BSMaterialsCDB::loadItem(
       static_cast< CDBObject_Float * >(o)->value = buf2.readFloat();
       break;
     case BSReflStream::String_Double:
-      // FIXME: implement this in a portable way
       static_cast< CDBObject_Double * >(o)->value =
           std::bit_cast< double, std::uint64_t >(buf2.readUInt64());
       break;

diff --git a/lib/libfo76utils/src/common.cpp b/lib/libfo76utils/src/common.cpp
@@ -57,7 +57,6 @@ NifSkopeError::~NifSkopeError() noexcept
 #if ENABLE_X86_64_SIMD < 3
 std::uint16_t convertToFloat16(float x)
 {
-#  if defined(__i386__) || defined(__x86_64__) || defined(__x86_64)
   std::uint32_t n = std::bit_cast< std::uint32_t >(x);
   std::uint32_t s = (n & 0x80000000U) >> 16;
   n = n & 0x7FFFFFFFU;
@@ -66,21 +65,9 @@ std::uint16_t convertToFloat16(float x)
     x = std::bit_cast< float >(n);
     return std::uint16_t(std::uint32_t(roundFloat(x * float(1 << 24))) | s);
   }
-  n = (n - 0x37FFF000U) >> 13;
+  n = (n + ((n & 0x00002000U) >> 13) - 0x37FFF001U) >> 13;
   n = (n < 0x7FFFU ? n : 0x7FFFU);
   return std::uint16_t(n | s);
-#  else
-  int     e = 0;
-  int     m = roundFloat(float(std::frexp(x, &e)) * 2048.0f);
-  if (!m)
-    return 0;
-  int     s = m & 0x8000;
-  m = std::abs(m);
-  e = e + 14 + (m >> 11);
-  if (e <= 0 || e > 31)
-    return std::uint16_t(s | (e <= 0 ? 0x0000 : 0x7FFF));
-  return std::uint16_t(s | (e << 10) | (m & 0x03FF));
-#  endif
 }
 #endif
 

diff --git a/lib/libfo76utils/src/common.hpp b/lib/libfo76utils/src/common.hpp
@@ -131,24 +131,18 @@ inline float convertFloat16(unsigned short n)
     std::int16_t(n), 0, 0, 0, 0, 0, 0, 0
   };
   return __builtin_ia32_vcvtph2ps(tmp)[0];
-#elif defined(__i386__) || defined(__x86_64__) || defined(__x86_64)
+#else
   std::uint32_t m = (std::uint32_t) int((std::int16_t) n);
   std::uint32_t i = ((m << 13) & 0x8FFFE000U) + 0x38000000U;
   float   r = std::bit_cast< float >(i);
   if (!(m & 0x7C00U)) [[unlikely]]
   {
     // zero or denormal
-    i = std::bit_cast< std::uint32_t >(r) & 0xFF800000U;
+    i = i & 0xFF800000U;
     r = r - std::bit_cast< float >(i);
     r = r + r;
   }
   return r;
-#else
-  unsigned char e = (unsigned char) ((n >> 10) & 0x1F);
-  if (!e)
-    return 0.0f;
-  long long m = (long long) ((n & 0x03FF) | 0x0400) << e;
-  return (float(!(n & 0x8000) ? m : -m) * (1.0f / 33554432.0f));
 #endif
 }
 

diff --git a/lib/libfo76utils/src/filebuf.cpp b/lib/libfo76utils/src/filebuf.cpp
@@ -78,20 +78,9 @@ float FileBuffer::readFloat()
   if ((filePos + 4) > fileBufSize)
     errorMessage("end of input file");
   std::uint32_t tmp = readUInt32Fast();
-#if defined(__i386__) || defined(__x86_64__) || defined(__x86_64)
   if (!((tmp + 0x00800000U) & 0x7F000000U))
     return 0.0f;
   return std::bit_cast< float, std::uint32_t >(tmp);
-#else
-  int     e = int((tmp >> 23) & 0xFF);
-  if (e == 0x00 || e == 0xFF)
-    return 0.0f;
-  double  m = double(int((tmp & 0x007FFFFF) | 0x00800000));
-  m = std::ldexp(m, e - 150);
-  if (tmp & 0x80000000U)
-    m = -m;
-  return float(m);
-#endif
 }
 
 FloatVector4 FileBuffer::readFloatVector4()

diff --git a/lib/libfo76utils/src/fp32vec4.hpp b/lib/libfo76utils/src/fp32vec4.hpp
@@ -76,6 +76,8 @@ struct FloatVector4
                       const std::uint32_t *p1_3, const std::uint32_t *p2_3,
                       float xf, float yf, bool isSRGB = false);
   static inline FloatVector4 convertInt16(const std::uint64_t& n);
+  static inline FloatVector4 convertInt32(const std::int32_t *p);
+  inline void convertToInt32(std::int32_t *p);
   // if noInfNaN is true, Inf and NaN values are never returned
   static inline FloatVector4 convertFloat16(std::uint64_t n,
                                             bool noInfNaN = false);

diff --git a/lib/libfo76utils/src/fp32vec4_base.hpp b/lib/libfo76utils/src/fp32vec4_base.hpp
@@ -64,6 +64,19 @@ inline FloatVector4 FloatVector4::convertInt16(const std::uint64_t& n)
                       float(std::int16_t((n >> 48) & 0xFFFFU)));
 }
 
+inline FloatVector4 FloatVector4::convertInt32(const std::int32_t *p)
+{
+  return FloatVector4(float(p[0]), float(p[1]), float(p[2]), float(p[3]));
+}
+
+inline void FloatVector4::convertToInt32(std::int32_t *p)
+{
+  p[0] = std::int32_t(roundFloat(v[0]));
+  p[1] = std::int32_t(roundFloat(v[1]));
+  p[2] = std::int32_t(roundFloat(v[2]));
+  p[3] = std::int32_t(roundFloat(v[3]));
+}
+
 inline FloatVector4 FloatVector4::convertFloat16(std::uint64_t n, bool noInfNaN)
 {
   (void) noInfNaN;

diff --git a/lib/libfo76utils/src/fp32vec4_clang.hpp b/lib/libfo76utils/src/fp32vec4_clang.hpp
@@ -49,6 +49,24 @@ inline FloatVector4 FloatVector4::convertInt16(const std::uint64_t& n)
   return FloatVector4(v);
 }
 
+inline FloatVector4 FloatVector4::convertInt32(const std::int32_t *p)
+{
+  XMM_Int32 tmp = { p[0], p[1], p[2], p[3] };
+  XMM_Float v;
+  __asm__ ("vcvtdq2ps %1, %0" : "=x" (v) : "x" (tmp));
+  return FloatVector4(v);
+}
+
+inline void FloatVector4::convertToInt32(std::int32_t *p)
+{
+  XMM_Int32 tmp;
+  __asm__ ("vcvtps2dq %1, %0" : "=x" (tmp) : "xm" (v));
+  p[0] = tmp[0];
+  p[1] = tmp[1];
+  p[2] = tmp[2];
+  p[3] = tmp[3];
+}
+
 inline FloatVector4 FloatVector4::convertFloat16(
     std::uint64_t n, [[maybe_unused]] bool noInfNaN)
 {
@@ -87,33 +105,38 @@ inline FloatVector4 FloatVector4::convertFloat16(
 
 inline std::uint64_t FloatVector4::convertToFloat16(unsigned int mask) const
 {
-#if ENABLE_X86_64_SIMD >= 3
   XMM_UInt64  tmp;
+  XMM_Float   x = v;
   if (mask < 15U)
   {
     __asm__ ("vmovd %1, %0" : "=x" (tmp) : "r" (mask * 0x10204080U));
     __asm__ ("vpmovsxbd %0, %0" : "+x" (tmp));
     __asm__ ("vpsrad $0x1f, %0, %0" : "+x" (tmp));
-    __asm__ ("vpand %1, %0, %0" : "+x" (tmp) : "xm" (v));
-    __asm__ ("vcvtps2ph $0x00, %0, %0" : "+x" (tmp));
-  }
-  else
-  {
-    __asm__ ("vcvtps2ph $0x00, %1, %0" : "=x" (tmp) : "x" (v));
+    __asm__ ("vpand %1, %0, %0" : "+x" (x) : "x" (tmp));
   }
-  return tmp[0];
+#if ENABLE_X86_64_SIMD >= 3
+  __asm__ ("vcvtps2ph $0x00, %1, %0" : "=x" (tmp) : "x" (x));
 #else
-  std::uint64_t r = 0U;
-  if (mask & 1U)
-    r = ::convertToFloat16(v[0]);
-  if (mask & 2U)
-    r = r | (std::uint64_t(::convertToFloat16(v[1])) << 16);
-  if (mask & 4U)
-    r = r | (std::uint64_t(::convertToFloat16(v[2])) << 32);
-  if (mask & 8U)
-    r = r | (std::uint64_t(::convertToFloat16(v[3])) << 48);
-  return r;
+  XMM_UInt32  n = std::bit_cast< XMM_UInt32 >(x);
+  XMM_UInt32  s = { 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U };
+  s = s & n;
+  n = n ^ s;
+  XMM_UInt32  d = { 0x4B800000U, 0x4B800000U, 0x4B800000U, 0x4B800000U };
+  XMM_UInt32  m = { 0x000007FFU, 0x000007FFU, 0x000007FFU, 0x000007FFU };
+  __asm__ ("vmulps %1, %0, %0" : "+x" (d) : "x" (n));
+  __asm__ ("vcvtps2dq %0, %0" : "+x" (d));
+  d = d & m;
+  XMM_UInt32  offs = { 0x37FFF001U, 0x37FFF001U, 0x37FFF001U, 0x37FFF001U };
+  XMM_UInt32  r = { 0x00002000U, 0x00002000U, 0x00002000U, 0x00002000U };
+  r = (r & n) >> 13;
+  n = n + r - offs;
+  __asm__ ("vpsrad $0x0d, %0, %0" : "+x" (n));
+  __asm__ ("vpackssdw %0, %0, %0" : "+x" (s));
+  __asm__ ("vpmaxsd %1, %0, %0" : "+x" (n) : "x" (d));
+  __asm__ ("vpackssdw %0, %0, %0" : "+x" (n));
+  __asm__ ("vpor %2, %1, %0" : "=x" (tmp) : "x" (n), "x" (s));
 #endif
+  return tmp[0];
 }
 
 inline void FloatVector4::convertToFloats(float *p) const

diff --git a/lib/libfo76utils/src/fp32vec4_gcc.hpp b/lib/libfo76utils/src/fp32vec4_gcc.hpp
@@ -49,6 +49,21 @@ inline FloatVector4 FloatVector4::convertInt16(const std::uint64_t& n)
   return FloatVector4(__builtin_ia32_cvtdq2ps(v));
 }
 
+inline FloatVector4 FloatVector4::convertInt32(const std::int32_t *p)
+{
+  XMM_Int32 v = { p[0], p[1], p[2], p[3] };
+  return FloatVector4(__builtin_ia32_cvtdq2ps(v));
+}
+
+inline void FloatVector4::convertToInt32(std::int32_t *p)
+{
+  XMM_Int32 tmp = __builtin_ia32_cvtps2dq(v);
+  p[0] = tmp[0];
+  p[1] = tmp[1];
+  p[2] = tmp[2];
+  p[3] = tmp[3];
+}
+
 inline FloatVector4 FloatVector4::convertFloat16(
     std::uint64_t n, [[maybe_unused]] bool noInfNaN)
 {
@@ -86,33 +101,38 @@ inline FloatVector4 FloatVector4::convertFloat16(
 
 inline std::uint64_t FloatVector4::convertToFloat16(unsigned int mask) const
 {
-#if ENABLE_X86_64_SIMD >= 3
   XMM_UInt64  tmp;
+  XMM_Float   x = v;
   if (mask < 15U)
   {
     __asm__ ("vmovd %1, %0" : "=x" (tmp) : "r" (mask * 0x10204080U));
     __asm__ ("vpmovsxbd %0, %0" : "+x" (tmp));
     __asm__ ("vpsrad $0x1f, %0, %0" : "+x" (tmp));
-    __asm__ ("vpand %1, %0, %0" : "+x" (tmp) : "xm" (v));
-    __asm__ ("vcvtps2ph $0x00, %0, %0" : "+x" (tmp));
-  }
-  else
-  {
-    __asm__ ("vcvtps2ph $0x00, %1, %0" : "=x" (tmp) : "x" (v));
+    __asm__ ("vpand %1, %0, %0" : "+x" (x) : "x" (tmp));
   }
-  return tmp[0];
+#if ENABLE_X86_64_SIMD >= 3
+  __asm__ ("vcvtps2ph $0x00, %1, %0" : "=x" (tmp) : "x" (x));
 #else
-  std::uint64_t r = 0U;
-  if (mask & 1U)
-    r = ::convertToFloat16(v[0]);
-  if (mask & 2U)
-    r = r | (std::uint64_t(::convertToFloat16(v[1])) << 16);
-  if (mask & 4U)
-    r = r | (std::uint64_t(::convertToFloat16(v[2])) << 32);
-  if (mask & 8U)
-    r = r | (std::uint64_t(::convertToFloat16(v[3])) << 48);
-  return r;
+  XMM_UInt32  n = std::bit_cast< XMM_UInt32 >(x);
+  XMM_UInt32  s = { 0x80000000U, 0x80000000U, 0x80000000U, 0x80000000U };
+  s = s & n;
+  n = n ^ s;
+  XMM_UInt32  d = { 0x4B800000U, 0x4B800000U, 0x4B800000U, 0x4B800000U };
+  XMM_UInt32  m = { 0x000007FFU, 0x000007FFU, 0x000007FFU, 0x000007FFU };
+  __asm__ ("vmulps %1, %0, %0" : "+x" (d) : "x" (n));
+  __asm__ ("vcvtps2dq %0, %0" : "+x" (d));
+  d = d & m;
+  XMM_UInt32  offs = { 0x37FFF001U, 0x37FFF001U, 0x37FFF001U, 0x37FFF001U };
+  XMM_UInt32  r = { 0x00002000U, 0x00002000U, 0x00002000U, 0x00002000U };
+  r = (r & n) >> 13;
+  n = n + r - offs;
+  __asm__ ("vpsrad $0x0d, %0, %0" : "+x" (n));
+  __asm__ ("vpackssdw %0, %0, %0" : "+x" (s));
+  __asm__ ("vpmaxsd %1, %0, %0" : "+x" (n) : "x" (d));
+  __asm__ ("vpackssdw %0, %0, %0" : "+x" (n));
+  __asm__ ("vpor %2, %1, %0" : "=x" (tmp) : "x" (n), "x" (s));
 #endif
+  return tmp[0];
 }
 
 inline void FloatVector4::convertToFloats(float *p) const

diff --git a/lib/libfo76utils/src/sfcube2.cpp b/lib/libfo76utils/src/sfcube2.cpp
@@ -519,8 +519,8 @@ void SFCubeMapCache::convertHDRToDDSThread(
   {
     int     n = yStart / cubeWidth;
     int     y = yStart % cubeWidth;
-    FloatVector4  xi_v(0.0f);
-    FloatVector4  yi_v(0.0f);
+    std::int32_t  xi_v[4];
+    std::int32_t  yi_v[4];
     FloatVector4  xf_v(0.0f);
     FloatVector4  yf_v(0.0f);
     for (int x = 0; x < cubeWidth; x++, p = p + outPixelSize)
@@ -546,13 +546,15 @@ void SFCubeMapCache::convertHDRToDDSThread(
         FloatVector4  xf = atan2NormFast(tmpX, tmpY) * 0.5f + 0.5f;
         xf = xf * float(w) - 0.5f;
         yf = yf * float(h) - 0.5f;
-        xi_v = FloatVector4(xf).floorValues();
-        yi_v = FloatVector4(yf).floorValues();
-        xf_v = xf - xi_v;
-        yf_v = yf - yi_v;
+        FloatVector4  xi = FloatVector4(xf).floorValues();
+        FloatVector4  yi = FloatVector4(yf).floorValues();
+        xi.convertToInt32(xi_v);
+        yi.convertToInt32(yi_v);
+        xf_v = xf - xi;
+        yf_v = yf - yi;
       }
-      int     x0 = int(xi_v[x & 3]);
-      int     y0 = int(yi_v[x & 3]);
+      int     x0 = xi_v[x & 3];
+      int     y0 = yi_v[x & 3];
       float   xf = xf_v[x & 3];
       float   yf = yf_v[x & 3];
       x0 = (x0 <= (w - 1) ? (x0 >= 0 ? x0 : (w - 1)) : 0);