support extents with arbitrary value types

alpaka-group · May 9, 2022 · c0c49c1 · c0c49c1
1 parent 55f841c
commit c0c49c1
Show file tree

Hide file tree

Showing 58 changed files with 666 additions and 444 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -41,7 +41,7 @@ if (BUILD_TESTING)
 		target_compile_options(tests PRIVATE /permissive- /constexpr:steps10000000 /diagnostics:caret)
 	else()
 		target_compile_features(tests PRIVATE cxx_std_20)
-		target_compile_options(tests PRIVATE -Wall -Wextra -Wno-missing-braces)
+		target_compile_options(tests PRIVATE -Wall -Wextra -Werror=narrowing -Wno-missing-braces)
 	endif()
 	if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM")
 		target_compile_options(tests PRIVATE -fconstexpr-steps=10000000)

diff --git a/docs/pages/api.rst b/docs/pages/api.rst
@@ -45,7 +45,7 @@ Array dimensions
 
 .. doxygenstruct:: llama::ArrayExtents
 .. doxygentypedef:: llama::ArrayExtentsDynamic
-.. doxygentypedef:: llama::ArrayExtentsStatic
+.. doxygentypedef:: llama::ArrayExtentsNCube
 .. doxygenstruct:: llama::ArrayIndex
 
 .. doxygenstruct:: llama::ArrayIndexIterator

diff --git a/examples/alpaka/asyncblur/asyncblur.cpp b/examples/alpaka/asyncblur/asyncblur.cpp
@@ -85,8 +85,10 @@ struct BlurKernel
             {
                 // Using SoA for the shared memory
                 constexpr auto sharedChunkSize = ElemsPerBlock + 2 * KernelSize;
-                constexpr auto sharedMapping = llama::mapping::
-                    SoA<llama::ArrayExtents<sharedChunkSize, sharedChunkSize>, typename View::RecordDim, false>{};
+                constexpr auto sharedMapping = llama::mapping::SoA<
+                    llama::ArrayExtents<std::size_t, sharedChunkSize, sharedChunkSize>,
+                    typename View::RecordDim,
+                    false>{};
                 auto& sharedMem = alpaka::declareSharedVar<std::byte[sharedMapping.blobSize(0)], __COUNTER__>(acc);
                 return llama::View(sharedMapping, llama::Array<std::byte*, 1>{&sharedMem[0]});
             }
@@ -211,7 +213,7 @@ try
     const auto hostMapping
         = llama::mapping::tree::Mapping{llama::ArrayExtents{buffer_y, buffer_x}, treeOperationList, Pixel{}};
     const auto devMapping = llama::mapping::tree::Mapping{
-        llama::ArrayExtents<CHUNK_SIZE + 2 * KERNEL_SIZE, CHUNK_SIZE + 2 * KERNEL_SIZE>{},
+        llama::ArrayExtents<std::size_t, CHUNK_SIZE + 2 * KERNEL_SIZE, CHUNK_SIZE + 2 * KERNEL_SIZE>{},
         treeOperationList,
         PixelOnAcc{}};
     using DevMapping = std::decay_t<decltype(devMapping)>;

diff --git a/examples/alpaka/nbody/nbody.cpp b/examples/alpaka/nbody/nbody.cpp
@@ -102,7 +102,7 @@ LLAMA_FN_HOST_ACC_INLINE auto store(FP& dst, Vec v)
         v.store(&dst);
 }
 
-template<std::size_t Elems>
+template<int Elems>
 struct VecType
 {
     // TODO(bgruber): we need a vector type that also works on GPUs
@@ -116,7 +116,7 @@ struct VecType<1>
     using type = FP;
 };
 
-template<std::size_t Elems, typename ViewParticleI, typename VirtualParticleJ>
+template<int Elems, typename ViewParticleI, typename VirtualParticleJ>
 LLAMA_FN_HOST_ACC_INLINE void pPInteraction(ViewParticleI pi, VirtualParticleJ pj)
 {
     using Vec = typename VecType<Elems>::type;
@@ -143,7 +143,7 @@ LLAMA_FN_HOST_ACC_INLINE void pPInteraction(ViewParticleI pi, VirtualParticleJ p
     store<Vec>(pi(tag::Vel{}, tag::Z{}), zdistanceSqr * sts + load<Vec>(pi(tag::Vel{}, tag::Z{})));
 }
 
-template<std::size_t ProblemSize, std::size_t Elems, std::size_t BlockSize, Mapping MappingSM>
+template<int ProblemSize, int Elems, int BlockSize, Mapping MappingSM>
 struct UpdateKernel
 {
     template<typename Acc, typename View>
@@ -158,7 +158,7 @@ struct UpdateKernel
             {
                 constexpr auto sharedMapping = []
                 {
-                    using ArrayExtents = llama::ArrayExtents<BlockSize>;
+                    using ArrayExtents = llama::ArrayExtents<int, BlockSize>;
                     if constexpr(MappingSM == AoS)
                         return llama::mapping::AoS<ArrayExtents, Particle>{};
                     if constexpr(MappingSM == SoA)
@@ -181,7 +181,7 @@ struct UpdateKernel
         auto pi = [&]
         {
             constexpr auto mapping
-                = llama::mapping::SoA<llama::ArrayExtents<Elems>, typename View::RecordDim, false>{};
+                = llama::mapping::SoA<llama::ArrayExtents<int, Elems>, typename View::RecordDim, false>{};
             return llama::allocViewUninitialized(mapping, llama::bloballoc::Stack<mapping.blobSize(0)>{});
         }();
         // TODO(bgruber): vector load
@@ -190,26 +190,26 @@ struct UpdateKernel
             pi(e) = particles(ti * Elems + e);
 
         LLAMA_INDEPENDENT_DATA
-        for(std::size_t blockOffset = 0; blockOffset < ProblemSize; blockOffset += BlockSize)
+        for(int blockOffset = 0; blockOffset < ProblemSize; blockOffset += BlockSize)
         {
             LLAMA_INDEPENDENT_DATA
-            for(auto j = tbi; j < BlockSize; j += THREADS_PER_BLOCK)
+            for(int j = tbi; j < BlockSize; j += THREADS_PER_BLOCK)
                 sharedView(j) = particles(blockOffset + j);
             alpaka::syncBlockThreads(acc);
 
             LLAMA_INDEPENDENT_DATA
-            for(auto j = std::size_t{0}; j < BlockSize; ++j)
+            for(int j = 0; j < BlockSize; ++j)
                 pPInteraction<Elems>(pi(0u), sharedView(j));
             alpaka::syncBlockThreads(acc);
         }
         // TODO(bgruber): vector store
         LLAMA_INDEPENDENT_DATA
-        for(auto e = 0u; e < Elems; e++)
+        for(int e = 0u; e < Elems; e++)
             particles(ti * Elems + e) = pi(e);
     }
 };
 
-template<std::size_t ProblemSize, std::size_t Elems>
+template<int ProblemSize, int Elems>
 struct MoveKernel
 {
     template<typename Acc, typename View>
@@ -235,7 +235,7 @@ template<template<typename, typename> typename AccTemplate, Mapping MappingGM, M
 void run(std::ostream& plotFile)
 {
     using Dim = alpaka::DimInt<1>;
-    using Size = std::size_t;
+    using Size = int;
     using Acc = AccTemplate<Dim, Size>;
     using DevHost = alpaka::DevCpu;
     using DevAcc = alpaka::Dev<Acc>;
@@ -262,7 +262,7 @@ void run(std::ostream& plotFile)
 
     auto mapping = []
     {
-        using ArrayExtents = llama::ArrayExtents<llama::dyn>;
+        using ArrayExtents = llama::ArrayExtentsDynamic<1, int>;
         const auto extents = ArrayExtents{PROBLEM_SIZE};
         if constexpr(MappingGM == AoS)
             return llama::mapping::AoS<ArrayExtents, Particle>{extents};
@@ -290,7 +290,7 @@ void run(std::ostream& plotFile)
 
     std::mt19937_64 generator;
     std::normal_distribution<FP> distribution(FP(0), FP(1));
-    for(std::size_t i = 0; i < PROBLEM_SIZE; ++i)
+    for(int i = 0; i < PROBLEM_SIZE; ++i)
     {
         llama::One<Particle> p;
         p(tag::Pos(), tag::X()) = distribution(generator);
@@ -315,7 +315,7 @@ void run(std::ostream& plotFile)
 
     double sumUpdate = 0;
     double sumMove = 0;
-    for(std::size_t s = 0; s < STEPS; ++s)
+    for(int s = 0; s < STEPS; ++s)
     {
         auto updateKernel = UpdateKernel<PROBLEM_SIZE, DESIRED_ELEMENTS_PER_THREAD, THREADS_PER_BLOCK, MappingSM>{};
         alpaka::exec<Acc>(queue, workdiv, updateKernel, accView);

diff --git a/examples/alpaka/pic/pic.cpp b/examples/alpaka/pic/pic.cpp
@@ -309,7 +309,7 @@ auto setup(Queue& queue, const Dev& dev, const DevHost& devHost)
 
     auto particleMapping = [&]
     {
-        using ArrayExtents = llama::ArrayExtents<llama::dyn>;
+        using ArrayExtents = llama::ArrayExtentsDynamic<1>;
         const auto particleExtents = ArrayExtents{numpart};
         if constexpr(ParticleMapping == 0)
             return llama::mapping::AoS<ArrayExtents, Particle>{particleExtents};

diff --git a/examples/alpaka/vectoradd/vectoradd.cpp b/examples/alpaka/vectoradd/vectoradd.cpp
@@ -82,7 +82,7 @@ try
     // LLAMA
     const auto mapping = [&]
     {
-        using ArrayExtents = llama::ArrayExtents<llama::dyn>;
+        using ArrayExtents = llama::ArrayExtentsDynamic<1>;
         const auto extents = ArrayExtents{PROBLEM_SIZE};
         if constexpr(MAPPING == 0)
             return llama::mapping::AoS<ArrayExtents, Vector>{extents};

diff --git a/examples/bitpackfloat/bitpackfloat.cpp b/examples/bitpackfloat/bitpackfloat.cpp
@@ -21,7 +21,7 @@ auto main() -> int
     constexpr auto exponentBits = 5;
     constexpr auto mantissaBits = 13;
     const auto mapping
-        = llama::mapping::BitPackedFloatSoA{llama::ArrayExtents<llama::dyn>{N}, exponentBits, mantissaBits, Vector{}};
+        = llama::mapping::BitPackedFloatSoA{llama::ArrayExtents{N}, exponentBits, mantissaBits, Vector{}};
 
     auto view = llama::allocView(mapping);
 

diff --git a/examples/bitpackint/bitpackint.cpp b/examples/bitpackint/bitpackint.cpp
@@ -49,7 +49,7 @@ auto main() -> int
 
     // extract into a view of full size integers
     auto viewExtracted
-        = llama::allocViewUninitialized(llama::mapping::AoS<llama::ArrayExtents<llama::dyn>, Vector>{{N}});
+        = llama::allocViewUninitialized(llama::mapping::AoS<llama::ArrayExtentsDynamic<1>, Vector>{{N}});
     llama::copy(view, viewExtracted);
     if(!std::equal(view.begin(), view.end(), viewExtracted.begin(), viewExtracted.end()))
         fmt::print("ERROR: unpacked view is different\n");

diff --git a/examples/bufferguard/bufferguard.cpp b/examples/bufferguard/bufferguard.cpp
@@ -69,7 +69,7 @@ struct GuardMapping2D : llama::ArrayExtentsDynamic<2>
 
     template<std::size_t... RecordCoords>
     constexpr auto blobNrAndOffset(ArrayIndex ai, llama::RecordCoord<RecordCoords...> rc = {}) const
-        -> llama::NrAndOffset
+        -> llama::NrAndOffset<std::size_t>
     {
         // [0][0] is at left top
         const auto [row, col] = ai;
@@ -144,7 +144,8 @@ struct GuardMapping2D : llama::ArrayExtentsDynamic<2>
     }
 
 private:
-    constexpr auto offsetBlobNr(llama::NrAndOffset nao, std::size_t blobNrOffset) const -> llama::NrAndOffset
+    constexpr auto offsetBlobNr(llama::NrAndOffset<std::size_t> nao, std::size_t blobNrOffset) const
+        -> llama::NrAndOffset<std::size_t>
     {
         nao.nr += blobNrOffset;
         return nao;
@@ -202,8 +203,8 @@ void run(const std::string& mappingName)
 {
     std::cout << "\n===== Mapping " << mappingName << " =====\n\n";
 
-    constexpr auto rows = 7;
-    constexpr auto cols = 5;
+    constexpr std::size_t rows = 7;
+    constexpr std::size_t cols = 5;
     const auto extents = llama::ArrayExtents{rows, cols};
     const auto mapping = GuardMapping2D<Mapping, Vector>{extents};
     std::ofstream{"bufferguard_" + mappingName + ".svg"} << llama::toSvg(mapping);

diff --git a/examples/cuda/nbody/nbody.cu b/examples/cuda/nbody/nbody.cu
@@ -86,7 +86,7 @@ __global__ void updateSM(View particles)
     {
         constexpr auto sharedMapping = []
         {
-            using ArrayExtents = llama::ArrayExtents<SHARED_ELEMENTS_PER_BLOCK>;
+            using ArrayExtents = llama::ArrayExtents<int, SHARED_ELEMENTS_PER_BLOCK>;
             if constexpr(MappingSM == 0)
                 return llama::mapping::AoS<ArrayExtents, SharedMemoryParticle>{};
             if constexpr(MappingSM == 1)
@@ -111,15 +111,15 @@ __global__ void updateSM(View particles)
     const auto tbi = blockIdx.x;
 
     llama::One<Particle> pi = particles(ti);
-    for(std::size_t blockOffset = 0; blockOffset < PROBLEM_SIZE; blockOffset += SHARED_ELEMENTS_PER_BLOCK)
+    for(int blockOffset = 0; blockOffset < PROBLEM_SIZE; blockOffset += SHARED_ELEMENTS_PER_BLOCK)
     {
         LLAMA_INDEPENDENT_DATA
-        for(auto j = tbi; j < SHARED_ELEMENTS_PER_BLOCK; j += THREADS_PER_BLOCK)
+        for(int j = tbi; j < SHARED_ELEMENTS_PER_BLOCK; j += THREADS_PER_BLOCK)
             sharedView(j) = particles(blockOffset + j);
         __syncthreads();
 
         LLAMA_INDEPENDENT_DATA
-        for(auto j = std::size_t{0}; j < SHARED_ELEMENTS_PER_BLOCK; ++j)
+        for(int j = 0; j < SHARED_ELEMENTS_PER_BLOCK; ++j)
             pPInteraction(pi, sharedView(j));
         __syncthreads();
     }
@@ -133,7 +133,7 @@ __global__ void update(View particles)
 
     llama::One<Particle> pi = particles(ti);
     LLAMA_INDEPENDENT_DATA
-    for(auto j = std::size_t{0}; j < PROBLEM_SIZE; ++j)
+    for(int j = 0; j < PROBLEM_SIZE; ++j)
         pPInteraction(pi, particles(j));
     particles(ti)(tag::Vel{}) = pi(tag::Vel{});
 }
@@ -178,7 +178,7 @@ try
 
     auto mapping = []
     {
-        using ArrayExtents = llama::ArrayExtents<llama::dyn>;
+        using ArrayExtents = llama::ArrayExtentsDynamic<1, int>;
         const auto extents = ArrayExtents{PROBLEM_SIZE};
         if constexpr(Mapping == 0)
             return llama::mapping::AoS<ArrayExtents, Particle>{extents};
@@ -224,7 +224,7 @@ try
 
     std::default_random_engine engine;
     std::normal_distribution<FP> distribution(FP(0), FP(1));
-    for(std::size_t i = 0; i < PROBLEM_SIZE; ++i)
+    for(int i = 0; i < PROBLEM_SIZE; ++i)
     {
         llama::One<Particle> p;
         p(tag::Pos(), tag::X()) = distribution(engine);
@@ -267,7 +267,7 @@ try
 
     double sumUpdate = 0;
     double sumMove = 0;
-    for(std::size_t s = 0; s < STEPS; ++s)
+    for(int s = 0; s < STEPS; ++s)
     {
         if constexpr(RUN_UPATE)
         {
@@ -388,7 +388,7 @@ namespace manual
 
         std::default_random_engine engine;
         std::normal_distribution<FP> distribution(FP(0), FP(1));
-        for(std::size_t i = 0; i < PROBLEM_SIZE; ++i)
+        for(int i = 0; i < PROBLEM_SIZE; ++i)
         {
             hostPositions[i].x = distribution(engine);
             hostPositions[i].y = distribution(engine);
@@ -426,7 +426,7 @@ namespace manual
 
         double sumUpdate = 0;
         double sumMove = 0;
-        for(std::size_t s = 0; s < STEPS; ++s)
+        for(int s = 0; s < STEPS; ++s)
         {
             if constexpr(RUN_UPATE)
             {

diff --git a/examples/cuda/pitch/pitch.cu b/examples/cuda/pitch/pitch.cu
@@ -107,7 +107,7 @@ namespace llamaex
         template<std::size_t... RecordCoords>
         LLAMA_FN_HOST_ACC_INLINE constexpr auto blobNrAndOffset(
             typename Base::ArrayIndex ai,
-            RecordCoord<RecordCoords...> = {}) const -> NrAndOffset
+            RecordCoord<RecordCoords...> = {}) const -> NrAndOffset<std::size_t>
         {
             constexpr std::size_t flatFieldIndex =
 #ifdef __NVCC__
@@ -134,7 +134,7 @@ try
         prop.totalGlobalMem / 1024 / 1024,
         prop.sharedMemPerBlock / 1024);
 
-    const auto extents = llama::ArrayExtents{600, 800}; // height, width
+    const auto extents = llama::ArrayExtents<std::size_t, llama::dyn, llama::dyn>{600, 800}; // height, width
     const auto widthBytes = extents[1] * sizeof(RGB);
 
     const auto blockDim = dim3{16, 32, 1};

diff --git a/examples/nbody_benchmark/nbody.cpp b/examples/nbody_benchmark/nbody.cpp
@@ -85,7 +85,7 @@ void run(std::ostream& plotFile)
 
     const auto mapping = [&]
     {
-        using ArrayExtents = llama::ArrayExtents<llama::dyn>;
+        using ArrayExtents = llama::ArrayExtentsDynamic<1>;
         const auto extents = ArrayExtents{PROBLEM_SIZE};
         if constexpr(Mapping == 0)
             return llama::mapping::AoS<ArrayExtents, Particle>{extents};

diff --git a/examples/simpletest/simpletest.cpp b/examples/simpletest/simpletest.cpp
@@ -127,7 +127,7 @@ auto main() -> int
 try
 {
     // Defining two array dimensions
-    using ArrayExtents = llama::ArrayExtentsDynamic<2>;
+    using ArrayExtents = llama::ArrayExtentsDynamic<2, int>;
     // Setting the run time size of the array dimensions to 8192 * 8192
     auto extents = ArrayExtents{8192, 8192};
 
@@ -177,11 +177,11 @@ try
               << '\n';
 
     // iterating over the array dimensions at run time to do some stuff with the allocated data
-    for(size_t x = 0; x < extents[0]; ++x)
+    for(int x = 0; x < extents[0]; ++x)
         // telling the compiler that all data in the following loop is independent to each other and thus can be
         // vectorized
         LLAMA_INDEPENDENT_DATA
-    for(size_t y = 0; y < extents[1]; ++y)
+    for(int y = 0; y < extents[1]; ++y)
     {
         // Defining a functor for a given virtual record
         SetZeroFunctor<decltype(view(x, y))> szf{view(x, y)};
@@ -194,9 +194,9 @@ try
         view({x, y}) = double(x + y) / double(extents[0] + extents[1]);
     }
 
-    for(size_t x = 0; x < extents[0]; ++x)
+    for(int x = 0; x < extents[0]; ++x)
         LLAMA_INDEPENDENT_DATA
-    for(size_t y = 0; y < extents[1]; ++y)
+    for(int y = 0; y < extents[1]; ++y)
     {
         // Showing different options of access data with llama. Internally all do the same data- and mappingwise
         auto record = view(x, y);
@@ -209,9 +209,9 @@ try
     }
     double sum = 0.0;
     LLAMA_INDEPENDENT_DATA
-    for(size_t x = 0; x < extents[0]; ++x)
+    for(int x = 0; x < extents[0]; ++x)
         LLAMA_INDEPENDENT_DATA
-    for(size_t y = 0; y < extents[1]; ++y)
+    for(int y = 0; y < extents[1]; ++y)
         sum += view(x, y)(llama::RecordCoord<1, 0>{});
     std::cout << "Sum: " << sum << '\n';
 

diff --git a/examples/vectoradd/vectoradd.cpp b/examples/vectoradd/vectoradd.cpp
@@ -48,7 +48,7 @@ namespace usellama
 
         const auto mapping = [&]
         {
-            using ArrayExtents = llama::ArrayExtents<llama::dyn>;
+            using ArrayExtents = llama::ArrayExtentsDynamic<1>;
             const auto extents = ArrayExtents{PROBLEM_SIZE};
             if constexpr(MAPPING == 0)
                 return llama::mapping::AoS{extents, Vector{}};