diff --git a/examples/alpaka/nbody/nbody.cpp b/examples/alpaka/nbody/nbody.cpp
index 33e134d9cb..49d8ca5157 100644
--- a/examples/alpaka/nbody/nbody.cpp
+++ b/examples/alpaka/nbody/nbody.cpp
@@ -16,7 +16,7 @@
 #include
 #include
 
-constexpr auto MAPPING = 0; /// 0 native AoS, 1 native SoA, 2 tree AoS, 3 tree SoA
+constexpr auto MAPPING = 0; ///< 0 native AoS, 1 native SoA, 2 native SoA (separate blobs), 3 tree AoS, 4 tree SoA
 constexpr auto USE_SHARED = true; ///< defines whether shared memory shall be used
 constexpr auto USE_SHARED_TREE = true; ///< defines whether the shared memory shall use tree mapping or
                                        ///< native mapping
@@ -187,8 +187,10 @@ int main(int argc, char** argv)
         if constexpr (MAPPING == 1)
             return llama::mapping::SoA{arrayDomain, Particle{}};
         if constexpr (MAPPING == 2)
-            return llama::mapping::tree::Mapping{arrayDomain, llama::Tuple{}, Particle{}};
+            return llama::mapping::SoA{arrayDomain, Particle{}, std::true_type{}};
         if constexpr (MAPPING == 3)
+            return llama::mapping::tree::Mapping{arrayDomain, llama::Tuple{}, Particle{}};
+        if constexpr (MAPPING == 4)
             return llama::mapping::tree::Mapping{
                 arrayDomain,
                 llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT()},
diff --git a/examples/alpaka/vectoradd/vectoradd.cpp b/examples/alpaka/vectoradd/vectoradd.cpp
index e0117c3ede..1495c36192 100644
--- a/examples/alpaka/vectoradd/vectoradd.cpp
+++ b/examples/alpaka/vectoradd/vectoradd.cpp
@@ -16,7 +16,8 @@
 #include
 #include
 
-constexpr auto MAPPING = 0; /// 0 native AoS, 1 native SoA, 2 tree AoS, 3 tree SoA
+constexpr auto MAPPING
+    = 1; ///< 0 native AoS, 1 native SoA, 2 native SoA (separate blobs, does not work yet), 3 tree AoS, 4 tree SoA
 constexpr auto PROBLEM_SIZE = 64 * 1024 * 1024;
 constexpr auto BLOCK_SIZE = 256;
 constexpr auto STEPS = 10;
@@ -86,8 +87,10 @@ int main(int argc, char** argv)
         if constexpr (MAPPING == 1)
             return llama::mapping::SoA{arrayDomain, Vector{}};
         if constexpr (MAPPING == 2)
-            return llama::mapping::tree::Mapping{arrayDomain, llama::Tuple{}, Vector{}};
+            return llama::mapping::SoA{arrayDomain, Vector{}, std::true_type{}};
         if constexpr (MAPPING == 3)
+            return llama::mapping::tree::Mapping{arrayDomain, llama::Tuple{}, Vector{}};
+        if constexpr (MAPPING == 4)
             return llama::mapping::tree::Mapping{
                 arrayDomain,
                 llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT()},
diff --git a/examples/nbody/CMakeLists.txt b/examples/nbody/CMakeLists.txt
index 5cd47c23c8..6e926a39d8 100644
--- a/examples/nbody/CMakeLists.txt
+++ b/examples/nbody/CMakeLists.txt
@@ -10,8 +10,8 @@ target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama)
 if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
     target_compile_options(${PROJECT_NAME} PRIVATE
         -fno-math-errno # sqrt prevents vectorization otherwise
-#        -march=native
-#        -ffast-math
+        -march=native
+        -ffast-math
     )
 endif()
diff --git a/examples/nbody/nbody.cpp b/examples/nbody/nbody.cpp
index f7efa2bce6..7be786131a 100644
--- a/examples/nbody/nbody.cpp
+++ b/examples/nbody/nbody.cpp
@@ -7,7 +7,7 @@
 
 // needs -fno-math-errno, so std::sqrt() can be vectorized
 
-constexpr auto MAPPING = 1; ///< 0 native AoS, 1 native SoA, 2 tree AoS, 3 tree SoA
+constexpr auto MAPPING = 2; ///< 0 native AoS, 1 native SoA, 2 native SoA (separate blobs), 3 tree AoS, 4 tree SoA
 constexpr auto PROBLEM_SIZE = 16 * 1024; ///< total number of particles
 constexpr auto STEPS = 5; ///< number of steps to calculate
 constexpr auto TRACE = false;
@@ -86,8 +86,10 @@ namespace usellama
         if constexpr (MAPPING == 1)
             return llama::mapping::SoA{arrayDomain, Particle{}};
         if constexpr (MAPPING == 2)
-            return llama::mapping::tree::Mapping{arrayDomain, llama::Tuple{}, Particle{}};
+            return llama::mapping::SoA{arrayDomain, Particle{}, std::true_type{}};
         if constexpr (MAPPING == 3)
+            return llama::mapping::tree::Mapping{arrayDomain, llama::Tuple{}, Particle{}};
+        if constexpr (MAPPING == 4)
             return llama::mapping::tree::Mapping{
                 arrayDomain,
                 llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT()},
diff --git a/examples/vectoradd/vectoradd.cpp b/examples/vectoradd/vectoradd.cpp
index ee64f8f146..f9c23ba23f 100644
--- a/examples/vectoradd/vectoradd.cpp
+++ b/examples/vectoradd/vectoradd.cpp
@@ -3,7 +3,7 @@
 #include
 #include
 
-constexpr auto MAPPING = 3; /// 0 native AoS, 1 native SoA, 2 tree AoS, 3 tree SoA
+constexpr auto MAPPING = 2; ///< 0 native AoS, 1 native SoA, 2 native SoA (separate blobs), 3 tree AoS, 4 tree SoA
 constexpr auto PROBLEM_SIZE = 64 * 1024 * 1024; ///< problem size
 constexpr auto STEPS = 10; ///< number of vector adds to perform
 
@@ -48,8 +48,10 @@ namespace usellama
         if constexpr (MAPPING == 1)
             return llama::mapping::SoA{arrayDomain, Vector{}};
         if constexpr (MAPPING == 2)
-            return llama::mapping::tree::Mapping{arrayDomain, llama::Tuple{}, Vector{}};
+            return llama::mapping::SoA{arrayDomain, Vector{}, std::true_type{}};
         if constexpr (MAPPING == 3)
+            return llama::mapping::tree::Mapping{arrayDomain, llama::Tuple{}, Vector{}};
+        if constexpr (MAPPING == 4)
             return llama::mapping::tree::Mapping{
                 arrayDomain,
                 llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT()},
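
Note (not part of the diff): the MAPPING constant in each example only selects which mapping object the if-constexpr chain returns; view allocation and element access stay mapping-agnostic. A minimal usage sketch, assuming the Particle datum domain, tag::Pos/tag::X, and arrayDomain from the nbody example and llama::allocView's default allocator:

    const auto mapping = llama::mapping::SoA{arrayDomain, Particle{}, std::true_type{}}; // MAPPING == 2
    auto particles = llama::allocView(mapping); // allocates mapping.blobCount separate buffers
    particles(0)(tag::Pos{}, tag::X{}) = 1.0;   // access syntax is independent of the chosen mapping
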
diff --git a/include/llama/Array.hpp b/include/llama/Array.hpp
index 5d57cb9529..1c05aab7dd 100644
--- a/include/llama/Array.hpp
+++ b/include/llama/Array.hpp
@@ -19,28 +19,28 @@ namespace llama
         static constexpr std::size_t rank = N;
         T element[N > 0 ? N : 1];
 
-        LLAMA_FN_HOST_ACC_INLINE T* begin()
+        LLAMA_FN_HOST_ACC_INLINE constexpr T* begin()
         {
             return &element[0];
         }
 
-        LLAMA_FN_HOST_ACC_INLINE const T* begin() const
+        LLAMA_FN_HOST_ACC_INLINE constexpr const T* begin() const
         {
             return &element[0];
         }
 
-        LLAMA_FN_HOST_ACC_INLINE T* end()
+        LLAMA_FN_HOST_ACC_INLINE constexpr T* end()
         {
             return &element[N];
         };
 
-        LLAMA_FN_HOST_ACC_INLINE const T* end() const
+        LLAMA_FN_HOST_ACC_INLINE constexpr const T* end() const
        {
             return &element[N];
         };
 
         template <typename IndexType>
-        LLAMA_FN_HOST_ACC_INLINE auto operator[](IndexType&& idx) -> T&
+        LLAMA_FN_HOST_ACC_INLINE constexpr auto operator[](IndexType&& idx) -> T&
         {
             return element[idx];
         }
@@ -51,7 +51,7 @@ namespace llama
             return element[idx];
         }
 
-        LLAMA_FN_HOST_ACC_INLINE friend auto operator==(const Array& a, const Array& b) -> bool
+        LLAMA_FN_HOST_ACC_INLINE constexpr friend auto operator==(const Array& a, const Array& b) -> bool
         {
             for (std::size_t i = 0; i < N; ++i)
                 if (a.element[i] != b.element[i])
@@ -59,7 +59,7 @@ namespace llama
             return true;
         }
 
-        LLAMA_FN_HOST_ACC_INLINE friend auto operator+(const Array& a, const Array& b) -> Array
+        LLAMA_FN_HOST_ACC_INLINE constexpr friend auto operator+(const Array& a, const Array& b) -> Array
         {
             Array temp;
             for (std::size_t i = 0; i < N; ++i)
@@ -68,13 +68,13 @@ namespace llama
         }
 
         template <std::size_t I>
-        auto get() -> T&
+        constexpr auto get() -> T&
         {
             return element[I];
         }
 
         template <std::size_t I>
-        auto get() const -> const T&
+        constexpr auto get() const -> const T&
         {
             return element[I];
         }
diff --git a/include/llama/DumpMapping.hpp b/include/llama/DumpMapping.hpp
index 665a9d5b9a..1626e0f649 100644
--- a/include/llama/DumpMapping.hpp
+++ b/include/llama/DumpMapping.hpp
@@ -60,9 +60,9 @@ namespace llama
         }
 
         template <typename Mapping, typename ArrayDomain, std::size_t... Coords>
-        auto mappingOffset(const Mapping& mapping, const ArrayDomain& udCoord, DatumCoord<Coords...>)
+        auto mappingBlobNrAndOffset(const Mapping& mapping, const ArrayDomain& udCoord, DatumCoord<Coords...>)
         {
-            return mapping.template getBlobNrAndOffset<Coords...>(udCoord).offset;
+            return mapping.template getBlobNrAndOffset<Coords...>(udCoord);
         }
     } // namespace internal
 
@@ -82,7 +82,7 @@ namespace llama
             ArrayDomain udCoord;
             std::vector<std::size_t> ddIndices;
             std::vector<std::string> ddTags;
-            std::size_t offset;
+            NrAndOffset nrAndOffset;
             std::size_t size;
         };
         std::vector<DatumInfo> infos;
@@ -95,7 +95,7 @@ namespace llama
                     udCoord,
                     internal::toVec(coord),
                     internal::tagsAsStrings<DatumDomain>(coord),
-                    internal::mappingOffset(mapping, udCoord, coord),
+                    internal::mappingBlobNrAndOffset(mapping, udCoord, coord),
                     size});
             });
         }
@@ -140,8 +140,16 @@
         for (const auto& info : infos)
         {
-            const auto x = (info.offset % wrapByteCount) * byteSizeInPixel;
-            const auto y = (info.offset / wrapByteCount) * byteSizeInPixel;
+            std::size_t blobY = 0;
+            for (auto i = 0; i < info.nrAndOffset.nr; i++)
+            {
+                auto blobRows = (mapping.getBlobSize(i) + wrapByteCount - 1) / wrapByteCount;
+                blobRows++; // one row gap between blobs
+                blobY += blobRows * byteSizeInPixel;
+            }
+
+            const auto x = (info.nrAndOffset.offset % wrapByteCount) * byteSizeInPixel;
+            const auto y = (info.nrAndOffset.offset / wrapByteCount) * byteSizeInPixel + blobY;
 
             const auto fill = boost::hash_value(info.ddIndices) & 0xFFFFFF;
@@ -193,7 +201,7 @@
             ArrayDomain udCoord;
             std::vector<std::size_t> ddIndices;
             std::vector<std::string> ddTags;
-            std::size_t offset;
+            NrAndOffset nrAndOffset;
             std::size_t size;
         };
         std::vector<DatumInfo> infos;
@@ -206,11 +214,13 @@
                     udCoord,
                     internal::toVec(coord),
                     internal::tagsAsStrings<DatumDomain>(coord),
-                    internal::mappingOffset(mapping, udCoord, coord),
+                    internal::mappingBlobNrAndOffset(mapping, udCoord, coord),
                     size});
             });
         }
 
-        std::sort(begin(infos), end(infos), [](const DatumInfo& a, const DatumInfo& b) { return a.offset < b.offset; });
+        std::sort(begin(infos), end(infos), [](const DatumInfo& a, const DatumInfo& b) {
+            return std::tie(a.nrAndOffset.nr, a.nrAndOffset.offset) < std::tie(b.nrAndOffset.nr, b.nrAndOffset.offset);
+        });
 
         auto formatDDTags = [](const std::vector<std::string>& tags) {
             std::string s;
@@ -308,8 +318,14 @@
 )");
 
+        auto currentBlobNr = std::numeric_limits<std::size_t>::max();
         for (const auto& info : infos)
         {
+            if (currentBlobNr != info.nrAndOffset.nr)
+            {
+                currentBlobNr = info.nrAndOffset.nr;
+                svg += fmt::format("<h1>Blob: {}</h1>", currentBlobNr);
+            }
             const auto width = byteSizeInPixel * info.size;
             svg += fmt::format(
                 R"(<div class="box {0}" title="{1} {2}">{1} {2}</div>
 )",
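
Note (not part of the diff): a worked example for the blob row offset (blobY) computation in the DumpMapping.hpp hunk above, with illustrative numbers: for wrapByteCount = 64 and byteSizeInPixel = 30, a preceding blob of 2048 bytes occupies (2048 + 64 - 1) / 64 = 32 rows plus 1 gap row, so the next blob is drawn 33 * 30 = 990 pixels further down in the SVG.
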
)", diff --git a/include/llama/mapping/SoA.hpp b/include/llama/mapping/SoA.hpp index 5b14abb93f..8b96019d5b 100644 --- a/include/llama/mapping/SoA.hpp +++ b/include/llama/mapping/SoA.hpp @@ -16,34 +16,88 @@ namespace llama::mapping template < typename T_ArrayDomain, typename T_DatumDomain, + typename SeparateBuffers = std::false_type, // TODO: make this a bool. Needs work in SplitMapping typename LinearizeArrayDomainFunctor = LinearizeArrayDomainCpp> struct SoA { using ArrayDomain = T_ArrayDomain; using DatumDomain = T_DatumDomain; - static constexpr std::size_t blobCount = 1; + static constexpr std::size_t blobCount = []() constexpr + { + if constexpr (SeparateBuffers::value) + { + std::size_t count = 0; + forEach([&](auto) constexpr { count++; }); + return count; + } + else + return 1; + } + (); SoA() = default; LLAMA_FN_HOST_ACC_INLINE - SoA(ArrayDomain size, DatumDomain = {}) : arrayDomainSize(size) + SoA(ArrayDomain size, DatumDomain = {}, SeparateBuffers = {}) : arrayDomainSize(size) { } LLAMA_FN_HOST_ACC_INLINE - auto getBlobSize(std::size_t) const -> std::size_t + auto getBlobSize(std::size_t blobIndex) const -> std::size_t { - return LinearizeArrayDomainFunctor{}.size(arrayDomainSize) * sizeOf; + if constexpr (SeparateBuffers::value) + { + static constexpr llama::Array typeSizes = []() constexpr + { + llama::Array r{}; + std::size_t i = 0; + forEach([&](auto coord) constexpr { + r[i++] = sizeof(GetType); + }); + return r; + } + (); + return LinearizeArrayDomainFunctor{}.size(arrayDomainSize) * typeSizes[blobIndex]; + } + else + return LinearizeArrayDomainFunctor{}.size(arrayDomainSize) * sizeOf; } template LLAMA_FN_HOST_ACC_INLINE auto getBlobNrAndOffset(ArrayDomain coord) const -> NrAndOffset { - LLAMA_FORCE_INLINE_RECURSIVE - const auto offset = LinearizeArrayDomainFunctor{}(coord, arrayDomainSize) - * sizeof(GetType>) - + offsetOf * LinearizeArrayDomainFunctor{}.size(arrayDomainSize); - return {0, offset}; + if constexpr (SeparateBuffers::value) + { + using TargetDatumCoord = DatumCoord; + constexpr auto blob = [&]() constexpr + { + std::size_t index = 0; + bool found = false; + forEach([&](auto c) constexpr { + if constexpr (std::is_same_v) + found = true; + else if (!found) + index++; + }); + if (!found) + throw "Passed TargetDatumCoord must be in datum domain"; + return index; + } + (); + + LLAMA_FORCE_INLINE_RECURSIVE + const auto offset = LinearizeArrayDomainFunctor{}(coord, arrayDomainSize) + * sizeof(GetType>); + return {blob, offset}; + } + else + { + LLAMA_FORCE_INLINE_RECURSIVE + const auto offset = LinearizeArrayDomainFunctor{}(coord, arrayDomainSize) + * sizeof(GetType>) + + offsetOf * LinearizeArrayDomainFunctor{}.size(arrayDomainSize); + return {0, offset}; + } } ArrayDomain arrayDomainSize; diff --git a/include/llama/mapping/SplitMapping.hpp b/include/llama/mapping/SplitMapping.hpp index 881056ea19..cc42acfb71 100644 --- a/include/llama/mapping/SplitMapping.hpp +++ b/include/llama/mapping/SplitMapping.hpp @@ -77,7 +77,7 @@ namespace llama::mapping LLAMA_FN_HOST_ACC_INLINE auto getBlobSize(std::size_t) const -> std::size_t { - return mapping1BlobSize + mapping2.getBlobSize(); + return mapping1BlobSize + mapping2.getBlobSize(0); } template diff --git a/tests/dump.cpp b/tests/dump.cpp index f92278cbed..1da03c3dfc 100644 --- a/tests/dump.cpp +++ b/tests/dump.cpp @@ -81,6 +81,11 @@ TEST_CASE("dump.SoA") dump(llama::mapping::SoA{arrayDomain, Particle{}}, "SoAMapping"); } +TEST_CASE("dump.SoA.MultiBlob") +{ + dump(llama::mapping::SoA{arrayDomain, Particle{}, 
diff --git a/tests/dump.cpp b/tests/dump.cpp
index f92278cbed..1da03c3dfc 100644
--- a/tests/dump.cpp
+++ b/tests/dump.cpp
@@ -81,6 +81,11 @@ TEST_CASE("dump.SoA")
     dump(llama::mapping::SoA{arrayDomain, Particle{}}, "SoAMapping");
 }
 
+TEST_CASE("dump.SoA.MultiBlob")
+{
+    dump(llama::mapping::SoA{arrayDomain, Particle{}, std::true_type{}}, "SoAMappingMultiBlob");
+}
+
 TEST_CASE("dump.AoSoA.8")
 {
     dump(llama::mapping::AoSoA<decltype(arrayDomain), Particle, 8>{arrayDomain}, "AoSoAMapping8");
 }
@@ -91,7 +96,6 @@ TEST_CASE("dump.AoSoA.32")
     dump(llama::mapping::AoSoA<decltype(arrayDomain), Particle, 32>{arrayDomain}, "AoSoAMapping32");
 }
 
-
 TEST_CASE("dump.SplitMapping")
 {
     dump(
diff --git a/tests/mapping.cpp b/tests/mapping.cpp
index 657693e82c..aa515df991 100644
--- a/tests/mapping.cpp
+++ b/tests/mapping.cpp
@@ -1,8 +1,8 @@
 #include "common.h"
 
 #include
-#include
 #include
+#include
 
 // clang-format off
 namespace tag
 {
@@ -254,7 +254,9 @@ TEST_CASE("address.SoA.fortran")
 {
     using ArrayDomain = llama::ArrayDomain<2>;
     auto arrayDomain = ArrayDomain{16, 16};
-    auto mapping = llama::mapping::SoA<ArrayDomain, Particle, llama::mapping::LinearizeArrayDomainFortran>{arrayDomain};
+    auto mapping
+        = llama::mapping::SoA<ArrayDomain, Particle, std::false_type, llama::mapping::LinearizeArrayDomainFortran>{
+            arrayDomain};
 
     {
         const auto coord = ArrayDomain{0, 0};
@@ -310,7 +312,9 @@ TEST_CASE("address.SoA.morton")
     using ArrayDomain = llama::ArrayDomain<2>;
     auto arrayDomain = ArrayDomain{16, 16};
 
-    auto mapping = llama::mapping::SoA<ArrayDomain, Particle, llama::mapping::LinearizeArrayDomainMorton>{arrayDomain};
+    auto mapping
+        = llama::mapping::SoA<ArrayDomain, Particle, std::false_type, llama::mapping::LinearizeArrayDomainMorton>{
+            arrayDomain};
 
     {
         const auto coord = ArrayDomain{0, 0};
@@ -358,6 +362,58 @@ TEST_CASE("address.SoA.morton")
     }
 }
 
+TEST_CASE("address.SoA.MultiBlob")
+{
+    using ArrayDomain = llama::ArrayDomain<2>;
+    auto arrayDomain = ArrayDomain{16, 16};
+    auto mapping = llama::mapping::SoA<ArrayDomain, Particle, std::true_type>{arrayDomain};
+
+    {
+        const auto coord = ArrayDomain{0, 0};
+        CHECK(mapping.getBlobNrAndOffset<0, 0>(coord) == llama::NrAndOffset{0, 0});
+        CHECK(mapping.getBlobNrAndOffset<0, 1>(coord) == llama::NrAndOffset{1, 0});
+        CHECK(mapping.getBlobNrAndOffset<0, 2>(coord) == llama::NrAndOffset{2, 0});
+        CHECK(mapping.getBlobNrAndOffset<1>(coord) == llama::NrAndOffset{3, 0});
+        CHECK(mapping.getBlobNrAndOffset<2, 0>(coord) == llama::NrAndOffset{4, 0});
+        CHECK(mapping.getBlobNrAndOffset<2, 1>(coord) == llama::NrAndOffset{5, 0});
+        CHECK(mapping.getBlobNrAndOffset<2, 2>(coord) == llama::NrAndOffset{6, 0});
+        CHECK(mapping.getBlobNrAndOffset<3, 0>(coord) == llama::NrAndOffset{7, 0});
+        CHECK(mapping.getBlobNrAndOffset<3, 1>(coord) == llama::NrAndOffset{8, 0});
+        CHECK(mapping.getBlobNrAndOffset<3, 2>(coord) == llama::NrAndOffset{9, 0});
+        CHECK(mapping.getBlobNrAndOffset<3, 3>(coord) == llama::NrAndOffset{10, 0});
+    }
+
+    {
+        const auto coord = ArrayDomain{0, 1};
+        CHECK(mapping.getBlobNrAndOffset<0, 0>(coord) == llama::NrAndOffset{0, 8});
+        CHECK(mapping.getBlobNrAndOffset<0, 1>(coord) == llama::NrAndOffset{1, 8});
+        CHECK(mapping.getBlobNrAndOffset<0, 2>(coord) == llama::NrAndOffset{2, 8});
+        CHECK(mapping.getBlobNrAndOffset<1>(coord) == llama::NrAndOffset{3, 4});
+        CHECK(mapping.getBlobNrAndOffset<2, 0>(coord) == llama::NrAndOffset{4, 8});
+        CHECK(mapping.getBlobNrAndOffset<2, 1>(coord) == llama::NrAndOffset{5, 8});
+        CHECK(mapping.getBlobNrAndOffset<2, 2>(coord) == llama::NrAndOffset{6, 8});
+        CHECK(mapping.getBlobNrAndOffset<3, 0>(coord) == llama::NrAndOffset{7, 1});
+        CHECK(mapping.getBlobNrAndOffset<3, 1>(coord) == llama::NrAndOffset{8, 1});
+        CHECK(mapping.getBlobNrAndOffset<3, 2>(coord) == llama::NrAndOffset{9, 1});
+        CHECK(mapping.getBlobNrAndOffset<3, 3>(coord) == llama::NrAndOffset{10, 1});
+    }
+
+    {
+        const auto coord = ArrayDomain{1, 0};
+        CHECK(mapping.getBlobNrAndOffset<0, 0>(coord) == llama::NrAndOffset{0, 128});
+        CHECK(mapping.getBlobNrAndOffset<0, 1>(coord) == llama::NrAndOffset{1, 128});
+        CHECK(mapping.getBlobNrAndOffset<0, 2>(coord) == llama::NrAndOffset{2, 128});
+        CHECK(mapping.getBlobNrAndOffset<1>(coord) == llama::NrAndOffset{3, 64});
+        CHECK(mapping.getBlobNrAndOffset<2, 0>(coord) == llama::NrAndOffset{4, 128});
+        CHECK(mapping.getBlobNrAndOffset<2, 1>(coord) == llama::NrAndOffset{5, 128});
+        CHECK(mapping.getBlobNrAndOffset<2, 2>(coord) == llama::NrAndOffset{6, 128});
+        CHECK(mapping.getBlobNrAndOffset<3, 0>(coord) == llama::NrAndOffset{7, 16});
+        CHECK(mapping.getBlobNrAndOffset<3, 1>(coord) == llama::NrAndOffset{8, 16});
+        CHECK(mapping.getBlobNrAndOffset<3, 2>(coord) == llama::NrAndOffset{9, 16});
+        CHECK(mapping.getBlobNrAndOffset<3, 3>(coord) == llama::NrAndOffset{10, 16});
+    }
+}
+
 TEST_CASE("address.AoSoA.4")
 {
     using ArrayDomain = llama::ArrayDomain<2>;
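
Note (not part of the diff): the blob index lookup in SoA::getBlobNrAndOffset relies on forEach visiting the leaf coordinates of the datum domain in declaration order, the same property the blobCount initializer uses. A standalone sketch of that counting idea (assuming llama::forEach invokes the functor once per leaf DatumCoord):

    template <typename DatumDomain>
    constexpr std::size_t leafCount = []() constexpr {
        std::size_t count = 0;
        llama::forEach<DatumDomain>([&](auto) constexpr { count++; }); // one call per leaf field
        return count;
    }();
    // For the 11-field particle domain in tests/mapping.cpp: leafCount<Particle> == 11.
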