WIP: conversion of subnormals to efloat

stillwater-sc · Nov 1, 2024 · 307ab83 · 307ab83
1 parent 9b45f14
commit 307ab83
Show file tree

Hide file tree

Showing 6 changed files with 128 additions and 29 deletions.
diff --git a/elastic/efloat/api/api.cpp b/elastic/efloat/api/api.cpp
@@ -58,7 +58,7 @@ try {
 		float_decoder d;
 		d.parts.sign = false;
 		d.parts.exponent = ieee754_parameter<float>::bias + 64;
-		d.parts.fraction = 0x7FFFFu << 8;   // these are just the fraction bits, no hidden bit
+		d.parts.fraction = 0x7F'FF00u;   // these are just the 23 fraction bits, no hidden bit
 		std::cout << "fraction bits  : " << to_binary(d.parts.fraction, true) << '\n';
 		float f = d.f;
 		std::cout << "floating point : " << to_binary(f, true) << " : " << f << '\n';
@@ -72,9 +72,36 @@ try {
 	}
 
 	// default behavior
-	std::cout << "+---------    Default efloat has no subnormals, no supernormals and is not saturating\n";
+	std::cout << "+---------    Default efloat has no subnormals\n";
 	{
+		using TestType = efloat; // type under test in this scope
+
+		// create a subnormal: sign clear, exponent field 0, fraction field 1
+		float v;
+		setFields(v, false, 0u, 0x00'0001u); // smallest subnormal single precision float
+//		bool s{ false };
+//		uint32_t e{ 0 };
+//		uint32_t f{ 0 };
+//		uint32_t bits{ 0 };
+//		extractFields(v, s, e, f, bits);
+		std::cout << "subnormal      : " << to_binary(v) << " : " << v << '\n';
 
+		TestType a{ v }; // construct the efloat from the float subnormal
+
+		std::cout << "efloat triple  : " << to_triple(a) << " : " << a.significant() << " : " << double(a) << '\n';
+		std::cout << "sign           : " << sign(a) << '\n';
+		std::cout << "scale          : 2^" << scale(a) << '\n';
+		std::cout << "significant    : " << significant<float>(a) << "f\n";
+
+		double dv;
+		setFields(dv, true, 0ull, 0x1ull); // sign set, exponent field 0, fraction field 1: a negative double subnormal
+		std::cout << "floating point : " << to_binary(dv, true) << " : " << dv << '\n';
+		a = dv; // assign the double to the same efloat and print the same attributes again
+
+		std::cout << "efloat triple  : " << to_triple(a) << " : " << a.significant() << " : " << double(a) << '\n';
+		std::cout << "sign           : " << sign(a) << '\n';
+		std::cout << "scale          : 2^" << scale(a) << '\n';
+		std::cout << "significant    : " << significant<float>(a) << "f\n";
 	}
 
 	// explicit configuration

diff --git a/include/universal/native/extract_fields.hpp b/include/universal/native/extract_fields.hpp
@@ -13,19 +13,8 @@ namespace sw { namespace universal {
 #if BIT_CAST_IS_CONSTEXPR
 #include <bit>    // C++20 bit_cast
 
-	template<typename Real>
-	inline BIT_CAST_CONSTEXPR void extractFields(Real value, bool& s, uint64_t& rawExponentBits, uint64_t& rawFractionBits, uint64_t& bits) noexcept {
-		if (value == 0) {
-			s = false;
-			rawExponentBits = 0ull;
-			rawFractionBits = 0ull;
-		}
-		if (value < 0) s = true;
-	}
-
 	// specialization to extract fields from a float
-	template<>
-	inline BIT_CAST_CONSTEXPR void extractFields(float value, bool& s, uint64_t& rawExponentBits, uint64_t& rawFractionBits, uint64_t& bits) noexcept {
+	inline BIT_CAST_CONSTEXPR void extractFields(float value, bool& s, uint32_t& rawExponentBits, uint32_t& rawFractionBits, uint32_t& bits) noexcept {
 		uint32_t bc = std::bit_cast<uint32_t, float>(value);
 		s = (ieee754_parameter<float>::smask & bc);
 		rawExponentBits = (ieee754_parameter<float>::emask & bc) >> ieee754_parameter<float>::fbits;
@@ -34,7 +23,6 @@ namespace sw { namespace universal {
 	}
 
 	// specialization to extract fields from a double
-	template<>
 	inline BIT_CAST_CONSTEXPR void extractFields(double value, bool& s, uint64_t& rawExponentBits, uint64_t& rawFractionBits, uint64_t& bits) noexcept {
 		uint64_t bc = std::bit_cast<uint64_t, double>(value);
 		s = (ieee754_parameter<double>::smask & bc);
@@ -48,7 +36,7 @@ namespace sw { namespace universal {
 // Clang bit_cast<> can't deal with long double
 
 #if defined(LONG_DOUBLE_DOWNCAST)
-	template<>
+
 	inline BIT_CAST_CONSTEXPR void extractFields(long double value, bool& s, uint64_t& rawExponentBits, uint64_t& rawFractionBits, uint64_t& bits) noexcept {
 		double d = static_cast<double>(value);
 		uint64_t bc = std::bit_cast<uint64_t, double>(d);
@@ -61,7 +49,7 @@ namespace sw { namespace universal {
 /*
 	ETLO 8/1/2024: not able to make std::bit_cast<> work for long double
 	// specialization to extract fields from a long double
-	template<>
+
 	inline BIT_CAST_CONSTEXPR void extractFields(long double value, bool& s, uint64_t& rawExponentBits, uint64_t& rawFractionBits, uint64_t& bits) noexcept {
 		struct blob {
 			std::uint64_t hi;
@@ -92,12 +80,12 @@ namespace sw { namespace universal {
 ////////////////////////////////////////////////////////////////////////
 // nonconst extractFields for single precision floating-point
 
-	inline void extractFields(float value, bool& s, uint64_t& rawExponentBits, uint64_t& rawFractionBits, uint64_t& bits) noexcept {
+	inline void extractFields(float value, bool& s, uint32_t& rawExponentBits, uint32_t& rawFractionBits, uint32_t& bits) noexcept {
 		float_decoder decoder;
 		decoder.f = value;
 		s = decoder.parts.sign ? true : false;
-		rawExponentBits = static_cast<uint64_t>(decoder.parts.exponent);
-		rawFractionBits = static_cast<uint64_t>(decoder.parts.fraction);
+		rawExponentBits = decoder.parts.exponent; // raw exponent field as held by the decoder
+		rawFractionBits = decoder.parts.fraction; // raw fraction field as held by the decoder
-		bits = uint64_t(decoder.bits);
+		bits = static_cast<uint32_t>(decoder.bits); // match the uint32_t out-parameter instead of widening to uint64_t and narrowing implicitly
 	}
 

diff --git a/include/universal/native/set_fields.hpp b/include/universal/native/set_fields.hpp
@@ -28,6 +28,28 @@ namespace sw { namespace universal {
 		v = std::bit_cast<double, uint64_t>(raw);
 	}
 
+	////////////////////////////////////////////////////////////////////////
+	// setFields on single precision floating-point: pack sign | 8 exponent bits | 23 fraction bits and bit_cast the pattern into v
+
+	inline void setFields(float& v, bool s, uint32_t rawExponentBits, uint32_t rawFractionBits) noexcept {
+		const uint32_t signField     = (s ? 0x8000'0000u : 0u);          // bit 31
+		const uint32_t exponentField = (rawExponentBits & 0xFFu) << 23;  // bits 30..23, higher input bits are dropped
+		const uint32_t fractionField = (rawFractionBits & 0x7F'FFFFu);   // bits 22..0, higher input bits are dropped
+		const uint32_t raw = signField | exponentField | fractionField;
+		v = std::bit_cast<float, uint32_t>(raw);
+	}
+
+	////////////////////////////////////////////////////////////////////////
+	// setFields on double precision floating-point: pack sign | 11 exponent bits | 52 fraction bits and bit_cast the pattern into v
+
+	inline void setFields(double& v, bool s, uint64_t rawExponentBits, uint64_t rawFractionBits) noexcept {
+		uint64_t raw = (rawExponentBits & 0x7FF) << 52; // binary64 has an 11-bit exponent field; a 0xFF mask would drop its top three bits
+		raw |= (rawFractionBits & 0xF'FFFF'FFFF'FFFF); // 52 fraction bits, higher input bits are dropped
+		uint64_t mask = 0x8000'0000'0000'0000; // sign bit (bit 63)
+		if (s) raw |= mask;
+		v = std::bit_cast<double, uint64_t>(raw);
+	}
+
 #if LONG_DOUBLE_SUPPORT
 
 // Clang bit_cast<> can't deal with long double
@@ -129,7 +151,7 @@ namespace sw { namespace universal {
 ////////////////////////////////////////////////////////////////////////
 // nonconst setFields on single precision floating-point
 
-	inline void setFields(float& value, bool s, uint64_t rawExponentBits, uint64_t rawFractionBits) noexcept {
+	inline void setFields(float& value, bool s, uint32_t rawExponentBits, uint32_t rawFractionBits) noexcept {
 		float_decoder decoder;
 		decoder.parts.sign = s;
 		decoder.parts.exponent = rawExponentBits & 0xFF;

diff --git a/include/universal/number/efloat/attributes.hpp b/include/universal/number/efloat/attributes.hpp
@@ -19,6 +19,6 @@ namespace sw { namespace universal {
 
 	template<typename Real,
 		typename = typename std::enable_if< std::is_floating_point<Real>::value, Real >::type>
-	inline Real significant(const efloat& v) { return Real(v); }
+	inline Real significant(const efloat& v) { return static_cast<Real>(v.significant()); } // returns v.significant() cast to Real, not the whole efloat value converted to Real
 
 }}  // namespace sw::universal
diff --git a/include/universal/number/efloat/efloat.hpp b/include/universal/number/efloat/efloat.hpp
@@ -7,6 +7,19 @@
 #ifndef _EFLOAT_STANDARD_HEADER_
 #define _EFLOAT_STANDARD_HEADER_
 
+////////////////////////////////////////////////////////////////////////////////////////
+///  COMPILATION DIRECTIVES TO DIFFERENT COMPILERS
+#include <universal/utility/compiler.hpp>
+#include <universal/utility/architecture.hpp>
+#include <universal/utility/bit_cast.hpp>
+#include <universal/utility/long_double.hpp>
+
+////////////////////////////////////////////////////////////////////////////////////////
+/// required std libraries 
+#include <iostream>
+#include <iomanip>
+#include <vector>
+
 ////////////////////////////////////////////////////////////////////////////////////////
 ///  BEHAVIORAL COMPILATION SWITCHES
 

diff --git a/include/universal/number/efloat/efloat_impl.hpp b/include/universal/number/efloat/efloat_impl.hpp
@@ -174,10 +174,10 @@ class efloat {
 	std::vector<uint32_t> bits() const { return _limb; }
 
 protected:
+	FloatingPointState    _state;    // exceptional state
 	bool                  _sign;     // sign of the number: -1 if true, +1 if false, zero is positive
 	int64_t               _exponent; // exponent of the number
 	std::vector<uint32_t> _limb;     // limbs of the representation
-	FloatingPointState    _state;    // exceptional state
 
 	// HELPER methods
 
@@ -210,16 +210,66 @@ class efloat {
 		typename = typename std::enable_if< std::is_floating_point<Real>::value, Real >::type>
 	efloat& convert_ieee754(Real rhs) noexcept {
 		clear();
+		bool isSubnormal{ false };
+		switch (std::fpclassify(rhs)) { // dispatch on the floating-point class of rhs; zero, NaN and infinity return early without limbs
+		case FP_ZERO:
+			_state = FloatingPointState::Zero;
+			_sign = false; // zero is stored as positive
+			_exponent = 0;
+			// stay limbless
+			return *this;
+		case FP_NAN:
+			_sign = sw::universal::sign(rhs);
+			_state = (_sign ? FloatingPointState::SignalingNaN : FloatingPointState::QuietNaN); // the sign bit selects which NaN state is recorded
+			_exponent = 0;
+			// stay limbless
+			return *this;
+		case FP_INFINITE:
+			_state = FloatingPointState::Infinite;
+			_sign = sw::universal::sign(rhs); // keep the sign: the conversion back to native picks -inf/+inf from _sign
+			_exponent = 0;
+			// stay limbless
+			return *this;
+		case FP_SUBNORMAL:
+			isSubnormal = true; // the fraction is normalized after the switch
+			break;
+		case FP_NORMAL:
+		default:
+			break;
+		}
+
 		_sign = sw::universal::sign(rhs);
-		_exponent = sw::universal::scale(rhs);
+		_exponent = sw::universal::scale(rhs); // scale already deals with subnormal numbers
 		if constexpr (sizeof(Real) == 4) {
-			uint32_t bits = sw::universal::_extractSignificant<uint32_t, Real>(rhs);
-			bits <<= 8; // 32 - 23 = 9 bits to get the hidden bit to land on bit 31
+			uint32_t bits{ 0 };
+			if (isSubnormal) { // subnormal number
+				bits = sw::universal::_extractFraction<uint32_t, Real>(rhs);
+				bits <<= 8; // 31 - 23 = 8 bits to get the hidden bit to land on bit 31
+				uint32_t mask = 0x8000'0000;
+				while ((mask & bits) == 0) {
+					bits <<= 1;
+				}
+			}
+			else {
+				bits = sw::universal::_extractSignificant<uint32_t, Real>(rhs);
+				bits <<= 8; // 31 - 23 = 8 bits to get the hidden bit to land on bit 31
+			}
 			_limb.push_back(bits);
 		}
 		else if constexpr (sizeof(Real) == 8) {
-			uint64_t bits = sw::universal::_extractSignificant<uint64_t, Real>(rhs);
-			bits <<= 11; // 64 - 52 = 12 bits to get the hidden bit to land on bit 63
+			uint64_t bits{ 0 };
+			if (isSubnormal) { // subnormal number
+				bits = sw::universal::_extractFraction<uint64_t, Real>(rhs);
+				bits <<= 11; // 63 - 52 = 11 bits to get the hidden bit to land on bit 63
+				uint64_t mask = 0x8000'0000'0000'0000;
+				while ((mask & bits) == 0) {
+					bits <<= 1;
+				}
+			}
+			else {
+				bits = sw::universal::_extractSignificant<uint64_t, Real>(rhs);
+				bits <<= 11; // 63 - 52 = 11 bits to get the hidden bit to land on bit 63
+			}
 			_limb.push_back(static_cast<uint32_t>(bits >> 32));
 			_limb.push_back(static_cast<uint32_t>(bits & 0xFFFF'FFFF));
 		}
@@ -248,7 +298,6 @@ class efloat {
 			v = (_sign ? -std::numeric_limits<Real>::infinity() : +std::numeric_limits<Real>::infinity());
 			break;
 		case FloatingPointState::Normal:
-			Real bla = Real(significant());
 			v = Real(sign()) * std::pow(Real(2.0), Real(scale())) * Real(significant());
 		}
 		return v;