Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a new example demonstrating bitpacking of floats #414

Merged
merged 1 commit into from
Nov 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ if (LLAMA_BUILD_EXAMPLES)
add_subdirectory("examples/raycast")
add_subdirectory("examples/bitpack")
add_subdirectory("examples/bytesplit")
add_subdirectory("examples/floatpack")

# alpaka examples
find_package(alpaka 0.7.0 QUIET)
Expand Down
218 changes: 82 additions & 136 deletions examples/bitpack/bitpack.cpp
Original file line number Diff line number Diff line change
@@ -1,166 +1,112 @@
#include "../common/IntegralReference.hpp"

#include <cstdint>
#include <fmt/core.h>
#include <llama/llama.hpp>

// clang-format off
namespace tag
namespace mapping
{
struct X{};
struct Y{};
struct Z{};
} // namespace tag

using Vector = llama::Record<
llama::Field<tag::X, std::uint16_t>,
llama::Field<tag::Y, std::int32_t>,
llama::Field<tag::Z, std::uint64_t>
>;
// clang-format on

template<
typename TArrayExtents,
typename TRecordDim,
typename LinearizeArrayDimsFunctor = llama::mapping::LinearizeArrayDimsCpp>
struct BitpackSoA : TArrayExtents
{
using ArrayExtents = TArrayExtents;
using ArrayIndex = typename ArrayExtents::Index;
using RecordDim = TRecordDim;

static constexpr std::size_t blobCount = boost::mp11::mp_size<llama::FlatRecordDim<RecordDim>>::value;

constexpr BitpackSoA() = default;

LLAMA_FN_HOST_ACC_INLINE
constexpr explicit BitpackSoA(unsigned bits, ArrayExtents extents, RecordDim = {})
: ArrayExtents(extents)
, bits{bits}
template<
typename TArrayExtents,
typename TRecordDim,
typename LinearizeArrayDimsFunctor = llama::mapping::LinearizeArrayDimsCpp>
struct BitpackSoA : TArrayExtents
{
}
using ArrayExtents = TArrayExtents;
using ArrayIndex = typename ArrayExtents::Index;
using RecordDim = TRecordDim;

LLAMA_FN_HOST_ACC_INLINE constexpr auto extents() const -> ArrayExtents
{
return *this; // NOLINT(cppcoreguidelines-slicing)
}
static constexpr std::size_t blobCount = boost::mp11::mp_size<llama::FlatRecordDim<RecordDim>>::value;

LLAMA_FN_HOST_ACC_INLINE
constexpr auto blobSize(std::size_t /*blobIndex*/) const -> std::size_t
{
return (LinearizeArrayDimsFunctor{}.size(extents()) * bits + CHAR_BIT - 1) / CHAR_BIT;
}
using StoredIntegral
= std::uint64_t; // TODO(bgruber): we should choose an integral type which is as large as the
// largest type in the record dim. Otherwise, we might violate the alignment of the blobs.

template<std::size_t... RecordCoords>
static constexpr auto isComputed(llama::RecordCoord<RecordCoords...>)
{
return true;
}
constexpr BitpackSoA() = default;

// FIXME: might violate alignment
using RegisterInt = std::uint64_t;
LLAMA_FN_HOST_ACC_INLINE
constexpr explicit BitpackSoA(unsigned bits, ArrayExtents extents, RecordDim = {})
: ArrayExtents(extents)
, bits{bits}
{
}

template<typename T, typename Pointer>
struct Reference
{
Pointer ptr;
std::size_t bitOffset;
unsigned bits;
LLAMA_FN_HOST_ACC_INLINE constexpr auto extents() const -> ArrayExtents
{
return *this; // NOLINT(cppcoreguidelines-slicing)
}

static constexpr auto registerBits = sizeof(RegisterInt) * CHAR_BIT;
LLAMA_FN_HOST_ACC_INLINE
constexpr auto blobSize(std::size_t /*blobIndex*/) const -> std::size_t
{
constexpr auto bitsPerStoredIntegral = sizeof(StoredIntegral) * CHAR_BIT;
return (LinearizeArrayDimsFunctor{}.size(extents()) * bits + bitsPerStoredIntegral - 1)
/ bitsPerStoredIntegral;
}

// NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions)
operator T() const
template<std::size_t... RecordCoords>
static constexpr auto isComputed(llama::RecordCoord<RecordCoords...>)
{
auto* p = ptr + bitOffset / registerBits;
const auto innerBitOffset = bitOffset % registerBits;
auto v = p[0] >> innerBitOffset;

const auto innerBitEndOffset = innerBitOffset + bits;
if(innerBitEndOffset <= registerBits)
{
const auto mask = (RegisterInt{1} << bits) - 1u;
v &= mask;
}
else
{
const auto excessBits = innerBitEndOffset - registerBits;
const auto bitsLoaded = registerBits - innerBitOffset;
const auto mask = (RegisterInt{1} << excessBits) - 1u;
v |= (p[1] & mask) << bitsLoaded;
}
if constexpr(std::is_signed_v<T>)
if((v & (RegisterInt{1} << (bits - 1))) != 0)
{
// sign extend
v |= static_cast<RegisterInt>(-1) << bits;
}
return static_cast<T>(v);
return true;
}

auto operator=(T v) -> Reference&
template<std::size_t... RecordCoords, typename Blob>
LLAMA_FN_HOST_ACC_INLINE constexpr auto compute(
ArrayIndex ai,
llama::RecordCoord<RecordCoords...>,
llama::Array<Blob, blobCount>& blobs) const
{
const auto mask = (RegisterInt{1} << bits) - 1u;
const auto vBits = (static_cast<RegisterInt>(v) & mask);

auto* p = ptr + bitOffset / registerBits;
const auto innerBitOffset = bitOffset % registerBits;
const auto clearMask = ~(mask << innerBitOffset);
auto m = p[0] & clearMask; // clear previous bits
m |= vBits << innerBitOffset; // write new bits
p[0] = m;

const auto innerBitEndOffset = innerBitOffset + bits;
if(innerBitEndOffset > registerBits)
{
const auto excessBits = innerBitEndOffset - registerBits;
const auto bitsWritten = registerBits - innerBitOffset;
const auto clearMask = ~((RegisterInt{1} << excessBits) - 1u);
auto m = p[1] & clearMask; // clear previous bits
m |= vBits >> bitsWritten; // write new bits
p[1] = m;
}

return *this;
constexpr auto blob = llama::flatRecordCoord<RecordDim, llama::RecordCoord<RecordCoords...>>;
const auto bitOffset = LinearizeArrayDimsFunctor{}(ai, extents()) * bits;

using DstType = llama::GetType<RecordDim, llama::RecordCoord<RecordCoords...>>;
return internal::IntegralReference<DstType, StoredIntegral*>{
reinterpret_cast<StoredIntegral*>(&blobs[blob][0]),
bitOffset,
bits};
}
};

template<std::size_t... RecordCoords, typename Blob>
LLAMA_FN_HOST_ACC_INLINE constexpr auto compute(
ArrayIndex ai,
llama::RecordCoord<RecordCoords...>,
llama::Array<Blob, blobCount>& blobs) const
{
constexpr auto blob = llama::flatRecordCoord<RecordDim, llama::RecordCoord<RecordCoords...>>;
const auto bitOffset = LinearizeArrayDimsFunctor{}(ai, extents()) * bits;
template<std::size_t... RecordCoords, typename Blob>
LLAMA_FN_HOST_ACC_INLINE constexpr auto compute(
ArrayIndex ai,
llama::RecordCoord<RecordCoords...>,
const llama::Array<Blob, blobCount>& blobs) const
{
constexpr auto blob = llama::flatRecordCoord<RecordDim, llama::RecordCoord<RecordCoords...>>;
const auto bitOffset = LinearizeArrayDimsFunctor{}(ai, extents()) * bits;

using DstType = llama::GetType<RecordDim, llama::RecordCoord<RecordCoords...>>;
return internal::IntegralReference<DstType, const StoredIntegral*>{
reinterpret_cast<const StoredIntegral*>(&blobs[blob][0]),
bitOffset,
bits};
}

using DstType = llama::GetType<RecordDim, llama::RecordCoord<RecordCoords...>>;
return Reference<DstType, RegisterInt*>{reinterpret_cast<RegisterInt*>(&blobs[blob][0]), bitOffset, bits};
}
private:
unsigned bits = 0;
};
} // namespace mapping

template<std::size_t... RecordCoords, typename Blob>
LLAMA_FN_HOST_ACC_INLINE constexpr auto compute(
ArrayIndex ai,
llama::RecordCoord<RecordCoords...>,
const llama::Array<Blob, blobCount>& blobs) const
{
constexpr auto blob = llama::flatRecordCoord<RecordDim, llama::RecordCoord<RecordCoords...>>;
const auto bitOffset = LinearizeArrayDimsFunctor{}(ai, extents()) * bits;

using DstType = llama::GetType<RecordDim, llama::RecordCoord<RecordCoords...>>;
return Reference<DstType, const RegisterInt*>{
reinterpret_cast<const RegisterInt*>(&blobs[blob][0]),
bitOffset,
bits};
}
// clang-format off
namespace tag
{
struct X{};
struct Y{};
struct Z{};
} // namespace tag

private:
unsigned bits = 0;
};
using Vector = llama::Record<
llama::Field<tag::X, std::uint16_t>,
llama::Field<tag::Y, std::int32_t>,
llama::Field<tag::Z, std::uint64_t>
>;
// clang-format on

auto main() -> int
{
constexpr auto N = 128;
constexpr auto bits = 7;
const auto mapping = BitpackSoA{bits, llama::ArrayExtents<llama::dyn>{N}, Vector{}};
const auto mapping = mapping::BitpackSoA{bits, llama::ArrayExtents<llama::dyn>{N}, Vector{}};

auto view = llama::allocView(mapping);

Expand Down
83 changes: 83 additions & 0 deletions examples/common/IntegralReference.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#pragma once

#include <climits>
#include <cstddef>
#include <type_traits>

namespace internal
{
/// A proxy type representing a reference to an integral value, stored in a buffer at a specified bit offset.
/// @tparam Integral Integral data type which can be loaded and store through this reference.
/// @tparam StoredIntegralPointer Pointer to integral type used for storing the bits.
template<typename Integral, typename StoredIntegralPointer>
struct IntegralReference
{
using StoredIntegral = std::remove_const_t<std::remove_pointer_t<StoredIntegralPointer>>;

static_assert(std::is_integral_v<Integral>);
static_assert(std::is_integral_v<StoredIntegral>);
static_assert(
sizeof(StoredIntegral) >= sizeof(Integral),
"The integral type used for the storage must be at least as big as the type of the values to retrieve");

StoredIntegralPointer ptr;
std::size_t bitOffset;
unsigned bits;

static constexpr auto registerBits = sizeof(StoredIntegral) * CHAR_BIT;

// NOLINTNEXTLINE(google-explicit-constructor,hicpp-explicit-conversions)
operator Integral() const
{
auto* p = ptr + bitOffset / registerBits;
const auto innerBitOffset = bitOffset % registerBits;
auto v = p[0] >> innerBitOffset;

const auto innerBitEndOffset = innerBitOffset + bits;
if(innerBitEndOffset <= registerBits)
{
const auto mask = (StoredIntegral{1} << bits) - 1u;
v &= mask;
}
else
{
const auto excessBits = innerBitEndOffset - registerBits;
const auto bitsLoaded = registerBits - innerBitOffset;
const auto mask = (StoredIntegral{1} << excessBits) - 1u;
v |= (p[1] & mask) << bitsLoaded;
}
if constexpr(std::is_signed_v<Integral>)
if((v & (StoredIntegral{1} << (bits - 1))) != 0)
{
// sign extend
v |= static_cast<StoredIntegral>(-1) << bits;
}
return static_cast<Integral>(v);
}

auto operator=(Integral v) -> IntegralReference&
{
const auto mask = (StoredIntegral{1} << bits) - 1u;
const auto vBits = (static_cast<StoredIntegral>(v) & mask);

auto* p = ptr + bitOffset / registerBits;
const auto innerBitOffset = bitOffset % registerBits;
const auto clearMask = ~(mask << innerBitOffset);
auto m = p[0] & clearMask; // clear previous bits
m |= vBits << innerBitOffset; // write new bits
p[0] = m;

const auto innerBitEndOffset = innerBitOffset + bits;
if(innerBitEndOffset > registerBits)
{
const auto excessBits = innerBitEndOffset - registerBits;
const auto bitsWritten = registerBits - innerBitOffset;
const auto clearMask = ~((StoredIntegral{1} << excessBits) - 1u);
auto m = p[1] & clearMask; // clear previous bits
m |= vBits >> bitsWritten; // write new bits
p[1] = m;
}

return *this;
}
};
} // namespace internal
9 changes: 9 additions & 0 deletions examples/floatpack/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Standalone build script for the floatpack example (bit-packing of floats).
cmake_minimum_required (VERSION 3.15)
project(llama-floatpack CXX)

# When built outside the llama source tree, locate an installed llama package;
# inside the tree the llama::llama target already exists.
if (NOT TARGET llama::llama)
find_package(llama REQUIRED)
endif()
add_executable(${PROJECT_NAME} floatpack.cpp)
# llama is a C++17 library, so the example must compile as C++17 or later.
target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama)
Loading