Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a new example demonstrating bitpacking of floats #414

Merged
merged 1 commit into from
Nov 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ if (LLAMA_BUILD_EXAMPLES)
add_subdirectory("examples/raycast")
add_subdirectory("examples/bitpack")
add_subdirectory("examples/bytesplit")
add_subdirectory("examples/floatpack")

# alpaka examples
find_package(alpaka 0.7.0 QUIET)
Expand Down
218 changes: 82 additions & 136 deletions examples/bitpack/bitpack.cpp
Original file line number Diff line number Diff line change
@@ -1,166 +1,112 @@
#include "../common/IntegralReference.hpp"

#include <cstdint>
#include <fmt/core.h>
#include <llama/llama.hpp>

// clang-format off
namespace tag
namespace mapping
{
struct X{};
struct Y{};
struct Z{};
} // namespace tag

using Vector = llama::Record<
llama::Field<tag::X, std::uint16_t>,
llama::Field<tag::Y, std::int32_t>,
llama::Field<tag::Z, std::uint64_t>
>;
// clang-format on

template<
typename TArrayExtents,
typename TRecordDim,
typename LinearizeArrayDimsFunctor = llama::mapping::LinearizeArrayDimsCpp>
struct BitpackSoA : TArrayExtents
{
using ArrayExtents = TArrayExtents;
using ArrayIndex = typename ArrayExtents::Index;
using RecordDim = TRecordDim;

static constexpr std::size_t blobCount = boost::mp11::mp_size<llama::FlatRecordDim<RecordDim>>::value;

constexpr BitpackSoA() = default;

LLAMA_FN_HOST_ACC_INLINE
constexpr explicit BitpackSoA(unsigned bits, ArrayExtents extents, RecordDim = {})
: ArrayExtents(extents)
, bits{bits}
template<
typename TArrayExtents,
typename TRecordDim,
typename LinearizeArrayDimsFunctor = llama::mapping::LinearizeArrayDimsCpp>
struct BitpackSoA : TArrayExtents
{
}
using ArrayExtents = TArrayExtents;
using ArrayIndex = typename ArrayExtents::Index;
using RecordDim = TRecordDim;

LLAMA_FN_HOST_ACC_INLINE constexpr auto extents() const -> ArrayExtents
{
return *this; // NOLINT(cppcoreguidelines-slicing)
}
static constexpr std::size_t blobCount = boost::mp11::mp_size<llama::FlatRecordDim<RecordDim>>::value;

LLAMA_FN_HOST_ACC_INLINE
constexpr auto blobSize(std::size_t /*blobIndex*/) const -> std::size_t
{
return (LinearizeArrayDimsFunctor{}.size(extents()) * bits + CHAR_BIT - 1) / CHAR_BIT;
}
using StoredIntegral
= std::uint64_t; // TODO(bgruber): we should choose an integral type which is as large as the
// largest type in the record dim. Otherwise, we might violate the alignment of the blobs.

template<std::size_t... RecordCoords>
static constexpr auto isComputed(llama::RecordCoord<RecordCoords...>)
{
return true;
}
constexpr BitpackSoA() = default;

// FIXME: might violate alignment
using RegisterInt = std::uint64_t;
LLAMA_FN_HOST_ACC_INLINE
constexpr explicit BitpackSoA(unsigned bits, ArrayExtents extents, RecordDim = {})
: ArrayExtents(extents)
, bits{bits}
{
}

template<typename T, typename Pointer>
struct Reference
{
Pointer ptr;
std::size_t bitOffset;
unsigned bits;
LLAMA_FN_HOST_ACC_INLINE constexpr auto extents() const -> ArrayExtents
{
return *this; // NOLINT(cppcoreguidelines-slicing)
}

static constexpr auto registerBits = sizeof(RegisterInt) * CHAR_BIT;
LLAMA_FN_HOST_ACC_INLINE
constexpr auto blobSize(std::size_t /*blobIndex*/) const -> std::size_t
{
constexpr auto bitsPerStoredIntegral = sizeof(StoredIntegral) * CHAR_BIT;
return (LinearizeArrayDimsFunctor{}.size(extents()) * bits + bitsPerStoredIntegral - 1)
/ bitsPerStoredIntegral;
}

// NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions)
operator T() const
template<std::size_t... RecordCoords>
static constexpr auto isComputed(llama::RecordCoord<RecordCoords...>)
{
auto* p = ptr + bitOffset / registerBits;
const auto innerBitOffset = bitOffset % registerBits;
auto v = p[0] >> innerBitOffset;

const auto innerBitEndOffset = innerBitOffset + bits;
if(innerBitEndOffset <= registerBits)
{
const auto mask = (RegisterInt{1} << bits) - 1u;
v &= mask;
}
else
{
const auto excessBits = innerBitEndOffset - registerBits;
const auto bitsLoaded = registerBits - innerBitOffset;
const auto mask = (RegisterInt{1} << excessBits) - 1u;
v |= (p[1] & mask) << bitsLoaded;
}
if constexpr(std::is_signed_v<T>)
if((v & (RegisterInt{1} << (bits - 1))) != 0)
{
// sign extend
v |= static_cast<RegisterInt>(-1) << bits;
}
return static_cast<T>(v);
return true;
}

auto operator=(T v) -> Reference&
template<std::size_t... RecordCoords, typename Blob>
LLAMA_FN_HOST_ACC_INLINE constexpr auto compute(
ArrayIndex ai,
llama::RecordCoord<RecordCoords...>,
llama::Array<Blob, blobCount>& blobs) const
{
const auto mask = (RegisterInt{1} << bits) - 1u;
const auto vBits = (static_cast<RegisterInt>(v) & mask);

auto* p = ptr + bitOffset / registerBits;
const auto innerBitOffset = bitOffset % registerBits;
const auto clearMask = ~(mask << innerBitOffset);
auto m = p[0] & clearMask; // clear previous bits
m |= vBits << innerBitOffset; // write new bits
p[0] = m;

const auto innerBitEndOffset = innerBitOffset + bits;
if(innerBitEndOffset > registerBits)
{
const auto excessBits = innerBitEndOffset - registerBits;
const auto bitsWritten = registerBits - innerBitOffset;
const auto clearMask = ~((RegisterInt{1} << excessBits) - 1u);
auto m = p[1] & clearMask; // clear previous bits
m |= vBits >> bitsWritten; // write new bits
p[1] = m;
}

return *this;
constexpr auto blob = llama::flatRecordCoord<RecordDim, llama::RecordCoord<RecordCoords...>>;
const auto bitOffset = LinearizeArrayDimsFunctor{}(ai, extents()) * bits;

using DstType = llama::GetType<RecordDim, llama::RecordCoord<RecordCoords...>>;
return internal::IntegralReference<DstType, StoredIntegral*>{
reinterpret_cast<StoredIntegral*>(&blobs[blob][0]),
bitOffset,
bits};
}
};

template<std::size_t... RecordCoords, typename Blob>
LLAMA_FN_HOST_ACC_INLINE constexpr auto compute(
ArrayIndex ai,
llama::RecordCoord<RecordCoords...>,
llama::Array<Blob, blobCount>& blobs) const
{
constexpr auto blob = llama::flatRecordCoord<RecordDim, llama::RecordCoord<RecordCoords...>>;
const auto bitOffset = LinearizeArrayDimsFunctor{}(ai, extents()) * bits;
template<std::size_t... RecordCoords, typename Blob>
LLAMA_FN_HOST_ACC_INLINE constexpr auto compute(
ArrayIndex ai,
llama::RecordCoord<RecordCoords...>,
const llama::Array<Blob, blobCount>& blobs) const
{
constexpr auto blob = llama::flatRecordCoord<RecordDim, llama::RecordCoord<RecordCoords...>>;
const auto bitOffset = LinearizeArrayDimsFunctor{}(ai, extents()) * bits;

using DstType = llama::GetType<RecordDim, llama::RecordCoord<RecordCoords...>>;
return internal::IntegralReference<DstType, const StoredIntegral*>{
reinterpret_cast<const StoredIntegral*>(&blobs[blob][0]),
bitOffset,
bits};
}

using DstType = llama::GetType<RecordDim, llama::RecordCoord<RecordCoords...>>;
return Reference<DstType, RegisterInt*>{reinterpret_cast<RegisterInt*>(&blobs[blob][0]), bitOffset, bits};
}
private:
unsigned bits = 0;
};
} // namespace mapping

template<std::size_t... RecordCoords, typename Blob>
LLAMA_FN_HOST_ACC_INLINE constexpr auto compute(
ArrayIndex ai,
llama::RecordCoord<RecordCoords...>,
const llama::Array<Blob, blobCount>& blobs) const
{
constexpr auto blob = llama::flatRecordCoord<RecordDim, llama::RecordCoord<RecordCoords...>>;
const auto bitOffset = LinearizeArrayDimsFunctor{}(ai, extents()) * bits;

using DstType = llama::GetType<RecordDim, llama::RecordCoord<RecordCoords...>>;
return Reference<DstType, const RegisterInt*>{
reinterpret_cast<const RegisterInt*>(&blobs[blob][0]),
bitOffset,
bits};
}
// clang-format off
namespace tag
{
struct X{};
struct Y{};
struct Z{};
} // namespace tag

private:
unsigned bits = 0;
};
using Vector = llama::Record<
llama::Field<tag::X, std::uint16_t>,
llama::Field<tag::Y, std::int32_t>,
llama::Field<tag::Z, std::uint64_t>
>;
// clang-format on

auto main() -> int
{
constexpr auto N = 128;
constexpr auto bits = 7;
const auto mapping = BitpackSoA{bits, llama::ArrayExtents<llama::dyn>{N}, Vector{}};
const auto mapping = mapping::BitpackSoA{bits, llama::ArrayExtents<llama::dyn>{N}, Vector{}};

auto view = llama::allocView(mapping);

Expand Down
83 changes: 83 additions & 0 deletions examples/common/IntegralReference.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#pragma once

#include <climits>
#include <cstddef>
#include <type_traits>

namespace internal
{
/// A proxy type representing a reference to an integral value, stored in a buffer at a specified bit offset.
/// @tparam Integral Integral data type which can be loaded and store through this reference.
/// @tparam StoredIntegralPointer Pointer to integral type used for storing the bits.
template<typename Integral, typename StoredIntegralPointer>
struct IntegralReference
{
using StoredIntegral = std::remove_const_t<std::remove_pointer_t<StoredIntegralPointer>>;

static_assert(std::is_integral_v<Integral>);
static_assert(std::is_integral_v<StoredIntegral>);
static_assert(
sizeof(StoredIntegral) >= sizeof(Integral),
"The integral type used for the storage must be at least as big as the type of the values to retrieve");

StoredIntegralPointer ptr;
std::size_t bitOffset;
unsigned bits;

static constexpr auto registerBits = sizeof(StoredIntegral) * CHAR_BIT;

// NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions)
operator Integral() const
{
auto* p = ptr + bitOffset / registerBits;
const auto innerBitOffset = bitOffset % registerBits;
auto v = p[0] >> innerBitOffset;

const auto innerBitEndOffset = innerBitOffset + bits;
if(innerBitEndOffset <= registerBits)
{
const auto mask = (StoredIntegral{1} << bits) - 1u;
v &= mask;
}
else
{
const auto excessBits = innerBitEndOffset - registerBits;
const auto bitsLoaded = registerBits - innerBitOffset;
const auto mask = (StoredIntegral{1} << excessBits) - 1u;
v |= (p[1] & mask) << bitsLoaded;
}
if constexpr(std::is_signed_v<Integral>)
if((v & (StoredIntegral{1} << (bits - 1))) != 0)
{
// sign extend
v |= static_cast<StoredIntegral>(-1) << bits;
}
return static_cast<Integral>(v);
}

auto operator=(Integral v) -> IntegralReference&
{
const auto mask = (StoredIntegral{1} << bits) - 1u;
const auto vBits = (static_cast<StoredIntegral>(v) & mask);

auto* p = ptr + bitOffset / registerBits;
const auto innerBitOffset = bitOffset % registerBits;
const auto clearMask = ~(mask << innerBitOffset);
auto m = p[0] & clearMask; // clear previous bits
m |= vBits << innerBitOffset; // write new bits
p[0] = m;

const auto innerBitEndOffset = innerBitOffset + bits;
if(innerBitEndOffset > registerBits)
{
const auto excessBits = innerBitEndOffset - registerBits;
const auto bitsWritten = registerBits - innerBitOffset;
const auto clearMask = ~((StoredIntegral{1} << excessBits) - 1u);
auto m = p[1] & clearMask; // clear previous bits
m |= vBits >> bitsWritten; // write new bits
p[1] = m;
}

return *this;
}
};
} // namespace internal
9 changes: 9 additions & 0 deletions examples/floatpack/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Standalone build script for the floatpack example (bit-packing of floats).
cmake_minimum_required (VERSION 3.15)
project(llama-floatpack CXX)

# When built outside the llama source tree, locate an installed llama package;
# inside the tree the llama::llama target already exists.
if (NOT TARGET llama::llama)
find_package(llama REQUIRED)
endif()
add_executable(${PROJECT_NAME} floatpack.cpp)
# llama is a C++17 library, so the example must compile as C++17 or later.
target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama)
Loading