Refactor benchmark commons and drop clang-11 #802

Merged 3 commits on Nov 21, 2023
Changes from all commits
3 changes: 0 additions & 3 deletions .github/workflows/ci.yaml
@@ -189,9 +189,6 @@ jobs:
add_toolchain_repo: true
asan: ON
install_extra: g++-13
- name: build-ubuntu-clang11
cxx: clang++-11
install_extra: clang-11 libomp-11-dev
- name: build-ubuntu-clang12
cxx: clang++-12
install_extra: clang-12 libomp-12-dev
2 changes: 1 addition & 1 deletion README.md
@@ -38,7 +38,7 @@ The following compilers are supported by LLAMA and tested as part of our CI:

| Linux | Windows | MacOS |
|----------------------------------------------------------------------------------------------|-----------------------------------------------------|----------------------------------|
| g++ 9 - 13 </br> clang++ 11 - 17 </br> icpx (latest) </br> nvc++ 23.5 </br> nvcc 11.6 - 12.3 | Visual Studio 2022 </br> (latest on GitHub actions) | clang++ </br> (latest from brew) |
| g++ 9 - 13 </br> clang++ 12 - 17 </br> icpx (latest) </br> nvc++ 23.5 </br> nvcc 11.6 - 12.3 | Visual Studio 2022 </br> (latest on GitHub actions) | clang++ </br> (latest from brew) |


Single header
2 changes: 1 addition & 1 deletion examples/alpaka/asyncblur/CMakeLists.txt
@@ -8,6 +8,6 @@ if (NOT TARGET llama::llama)
find_package(llama REQUIRED)
endif()
find_package(alpaka 1.0 REQUIRED)
alpaka_add_executable(${PROJECT_NAME} asyncblur.cpp ../../common/alpakaHelpers.hpp ../../common/Stopwatch.hpp)
alpaka_add_executable(${PROJECT_NAME} asyncblur.cpp ../../common/Stopwatch.hpp)
target_include_directories(${PROJECT_NAME} SYSTEM PRIVATE ../../../thirdparty/stb/include)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama alpaka::alpaka)
35 changes: 34 additions & 1 deletion examples/alpaka/asyncblur/asyncblur.cpp
@@ -30,6 +30,39 @@ constexpr auto elemsPerBlock = 16; /// number of elements per direction(!) every

using FP = float;

constexpr auto threadElemDistMinElem = 2;

/** Returns a good guess for an optimal number of threads and elements in a
* block based on the total number of elements in the block.
*/
template<typename T_Acc, std::size_t BlockSize, std::size_t HardwareThreads>
struct ThreadsElemsDistribution
{
/// number of elements per thread
static constexpr std::size_t elemCount = BlockSize;
/// number of threads per block
static constexpr std::size_t threadCount = 1u;
};

#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
template<std::size_t BlockSize, std::size_t HardwareThreads, typename Dim, typename Size>
struct ThreadsElemsDistribution<alpaka::AccGpuCudaRt<Dim, Size>, BlockSize, HardwareThreads>
{
static constexpr std::size_t elemCount = threadElemDistMinElem;
static constexpr std::size_t threadCount = BlockSize / threadElemDistMinElem;
};
#endif

#ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
template<std::size_t BlockSize, std::size_t HardwareThreads, typename Dim, typename Size>
struct ThreadsElemsDistribution<alpaka::AccCpuOmp2Threads<Dim, Size>, BlockSize, HardwareThreads>
{
static constexpr std::size_t elemCount = (BlockSize + HardwareThreads - 1u) / HardwareThreads;
static constexpr std::size_t threadCount = HardwareThreads;
};
#endif


// clang-format off
namespace tag
{
@@ -157,7 +190,7 @@ try
int bufferY = defaultImgY + 2 * kernelSize;

constexpr int hardwareThreads = 2; // relevant for OpenMP2Threads
using Distribution = common::ThreadsElemsDistribution<Acc, elemsPerBlock, hardwareThreads>;
using Distribution = ThreadsElemsDistribution<Acc, elemsPerBlock, hardwareThreads>;
constexpr int elemCount = Distribution::elemCount;
constexpr int threadCount = Distribution::threadCount;

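With the values used in this example (elemsPerBlock = 16, hardwareThreads = 2), the three ThreadsElemsDistribution cases above resolve to 1 thread × 16 elements, 8 × 2 (CUDA) and 2 × 8 (OpenMP2 threads). A minimal compile-time sketch of that arithmetic, without the alpaka accelerator types:

```cpp
// Standalone sketch of the ThreadsElemsDistribution arithmetic above,
// using this example's values: BlockSize = 16, HardwareThreads = 2, minElem = 2.
#include <cstddef>

constexpr std::size_t blockSize = 16; // elemsPerBlock
constexpr std::size_t hwThreads = 2;  // hardwareThreads
constexpr std::size_t minElem = 2;    // threadElemDistMinElem

// primary template (e.g. sequential back-ends): 1 thread handles all 16 elements
static_assert(blockSize == 16);
// CUDA specialization: 16 / 2 = 8 threads, 2 elements each
static_assert(blockSize / minElem == 8);
// OpenMP2-threads specialization: 2 threads, ceil(16 / 2) = 8 elements each
static_assert((blockSize + hwThreads - 1) / hwThreads == 8);

int main() {} // nothing to run; the checks are compile-time only
```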
2 changes: 1 addition & 1 deletion examples/alpaka/daxpy/CMakeLists.txt
@@ -9,7 +9,7 @@ if (NOT TARGET llama::llama)
find_package(llama REQUIRED)
endif()
find_package(alpaka 1.0 REQUIRED)
alpaka_add_executable(${PROJECT_NAME} daxpy.cpp ../../common/Stopwatch.hpp ../../common/hostname.hpp)
alpaka_add_executable(${PROJECT_NAME} daxpy.cpp ../../common/Stopwatch.hpp ../../common/env.hpp)
target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama OpenMP::OpenMP_CXX alpaka::alpaka)

24 changes: 8 additions & 16 deletions examples/alpaka/daxpy/daxpy.cpp
@@ -2,7 +2,7 @@
// SPDX-License-Identifier: LGPL-3.0-or-later

#include "../../common/Stopwatch.hpp"
#include "../../common/hostname.hpp"
#include "../../common/env.hpp"

#include <alpaka/alpaka.hpp>
#include <alpaka/example/ExampleDefaultAcc.hpp>
@@ -156,26 +156,20 @@ void daxpyAlpakaLlama(std::string mappingName, std::ofstream& plotFile, Mapping
auto main() -> int
try
{
const auto numThreads = static_cast<std::size_t>(omp_get_max_threads());
const char* affinity = std::getenv("GOMP_CPU_AFFINITY"); // NOLINT(concurrency-mt-unsafe)
affinity = affinity == nullptr ? "NONE - PLEASE PIN YOUR THREADS!" : affinity;
const auto env = common::captureEnv();

fmt::print(
R"({}Mi doubles ({}MiB data)
Threads: {}
Affinity: {}
)",
"{}Mi doubles ({}MiB data)\n{}\n",
problemSize / 1024 / 1024,
problemSize * sizeof(double) / 1024 / 1024,
numThreads,
affinity);
env);

std::ofstream plotFile{"daxpy.sh"};
plotFile.exceptions(std::ios::badbit | std::ios::failbit);
plotFile << fmt::format(
R"(#!/usr/bin/gnuplot -p
# threads: {} affinity: {}
set title "daxpy CPU {}Mi doubles on {}"
# {}
set title "daxpy CPU {}Mi doubles"
set style data histograms
set style fill solid
set xtics rotate by 45 right
@@ -184,10 +178,8 @@ set yrange [0:*]
set ylabel "runtime [s]"
$data << EOD
)",
numThreads,
affinity,
problemSize / 1024 / 1024,
common::hostname());
env,
problemSize / 1024 / 1024);

daxpy(plotFile);

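The thread-count and affinity reporting removed above is replaced by common::captureEnv() from the new examples/common/env.hpp, whose contents are not part of the hunks shown here. A rough sketch of what such a helper could gather, based only on the information the old code printed (the actual implementation may collect more and format it differently):

```cpp
// Hypothetical sketch of a captureEnv()-style helper; the real one lives in
// examples/common/env.hpp and is not shown in this diff.
#include <cstdlib>
#include <string>

#include <fmt/format.h>
#include <omp.h>

namespace common
{
    inline auto captureEnv() -> std::string
    {
        const auto numThreads = omp_get_max_threads();
        const char* affinity = std::getenv("GOMP_CPU_AFFINITY"); // NOLINT(concurrency-mt-unsafe)
        return fmt::format(
            "Threads: {}; Affinity: {}",
            numThreads,
            affinity == nullptr ? "NONE - PLEASE PIN YOUR THREADS!" : affinity);
    }
} // namespace common

int main()
{
    fmt::print("{}\n", common::captureEnv()); // e.g. "Threads: 8; Affinity: ..."
}
```

The nbody, pic and vectoradd examples below additionally call captureEnv<Acc>(), presumably so that accelerator/device information ends up in the same string.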
16 changes: 8 additions & 8 deletions examples/alpaka/nbody/nbody.cpp
@@ -2,7 +2,7 @@
// SPDX-License-Identifier: LGPL-3.0-or-later

#include "../../common/Stopwatch.hpp"
#include "../../common/hostname.hpp"
#include "../../common/env.hpp"

#include <alpaka/alpaka.hpp>
#include <alpaka/example/ExampleDefaultAcc.hpp>
@@ -356,21 +356,21 @@ try
using Size = int;
using Acc = alpaka::ExampleDefaultAcc<Dim, Size>;

const auto env = common::captureEnv<Acc>();
std::cout << problemSize / 1000 << "k particles (" << problemSize * llama::sizeOf<Particle> / 1024 << "kiB)\n"
<< "Caching " << threadsPerBlock << " particles (" << threadsPerBlock * llama::sizeOf<Particle> / 1024
<< " kiB) in shared memory\n"
<< "Reducing on " << elementsPerThread << " particles per thread\n"
<< "Using " << threadsPerBlock << " threads per block\n";
const auto dev = alpaka::getDevByIdx(alpaka::Platform<Acc>{}, 0);
const auto props = alpaka::getAccDevProps<Acc>(dev);
std::cout << "Running on " << alpaka::getName(dev) << ", " << props.m_sharedMemSizeBytes / 1024 << "kiB SM\n";
<< "Using " << threadsPerBlock << " threads per block\n"
<< env << '\n';
std::cout << std::fixed;

std::ofstream plotFile{"nbody_alpaka.sh"};
plotFile.exceptions(std::ios::badbit | std::ios::failbit);
plotFile << fmt::format(
R"(#!/usr/bin/gnuplot -p
set title "nbody alpaka {}ki particles on {} on {}"
# {}
set title "nbody alpaka {}ki particles on {}"
set style data histograms
set style fill solid
set xtics rotate by 45 right
@@ -382,9 +382,9 @@
set y2tics auto
$data << EOD
)",
env,
problemSize / 1024,
alpaka::getAccName<Acc>(),
common::hostname());
alpaka::getAccName<Acc>());
plotFile << "\"\"\t\"update\"\t\"move\"\n";

run<Acc, AoS, AoS>(plotFile);
30 changes: 8 additions & 22 deletions examples/alpaka/pic/pic.cpp
@@ -5,7 +5,7 @@
#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !defined(ALPAKA_ACC_GPU_CUDA_ONLY_MODE)
# define ALPAKA_ACC_GPU_CUDA_ONLY_MODE
#endif
#include "../../common/hostname.hpp"
#include "../../common/env.hpp"

#include <alpaka/alpaka.hpp>
#include <alpaka/example/ExampleDefaultAcc.hpp>
Expand Down Expand Up @@ -877,29 +877,18 @@ void run(std::ostream& plotFile)
auto main() -> int
try
{
const auto numThreads = static_cast<std::size_t>(omp_get_max_threads());
const char* affinity = std::getenv("GOMP_CPU_AFFINITY"); // NOLINT(concurrency-mt-unsafe)
affinity = affinity == nullptr ? "NONE - PLEASE PIN YOUR THREADS!" : affinity;

using Acc = alpaka::ExampleDefaultAcc<Dim, Size>;
auto accName = alpaka::getName(alpaka::getDevByIdx(alpaka::Platform<Acc>{}, 0u));
while(static_cast<bool>(std::isspace(accName.back())))
accName.pop_back();
fmt::print(
"Running {} steps with grid {}x{} and {}k particles on {}\n",
nsteps,
gridX,
gridY,
numpart / 1000,
accName);
const auto env = common::captureEnv<Acc>();
const auto accName = common::trim(alpaka::getName(alpaka::getDevByIdx(alpaka::Platform<Acc>{}, 0u)));
fmt::print("Running {} steps with grid {}x{} and {}k particles\n{}\n", nsteps, gridX, gridY, numpart / 1000, env);

std::ofstream plotFile{"pic.sh"};
plotFile.exceptions(std::ios::badbit | std::ios::failbit);
fmt::print(
plotFile,
R"aa(#!/usr/bin/gnuplot -p
# threads: {} affinity: {}
set title "PIC grid {}x{} {}k particles on {} ({})"
# {}
set title "PIC grid {}x{} {}k particles on {}"
set style data histograms
set style fill solid
set xtics rotate by 45 right
@@ -909,14 +898,11 @@
$data << EOD
"" "clr J" "integr" " dep J" " bnd J" "adv B1" "bnd B1" " adv E" " bnd E" "adv B2" "bnd B2" "total"
)aa",
numThreads,
affinity,
env,
gridX,
gridY,
numpart / 1000,
accName,
common::hostname());

accName);

// FieldMapping: AoS RM, AoS CM, AoS Mo,
// SoA RM, SoA CM, SoA Mo,
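common::trim() used above comes from the refactored common headers and is likewise not shown in this diff. The code it replaces only stripped trailing whitespace from the accelerator name, so a plausible sketch (implementation assumed) is:

```cpp
// Hypothetical sketch of common::trim(), mirroring the trailing-whitespace loop
// that the old pic.cpp used inline; the actual helper may do more.
#include <cassert>
#include <cctype>
#include <string>

namespace common
{
    inline auto trim(std::string s) -> std::string
    {
        // Drop trailing whitespace characters, if any.
        while(!s.empty() && std::isspace(static_cast<unsigned char>(s.back())) != 0)
            s.pop_back();
        return s;
    }
} // namespace common

int main()
{
    assert(common::trim("some device name \n") == "some device name");
}
```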
2 changes: 1 addition & 1 deletion examples/alpaka/vectoradd/CMakeLists.txt
@@ -8,5 +8,5 @@ if (NOT TARGET llama::llama)
find_package(llama REQUIRED)
endif()
find_package(alpaka 1.0 REQUIRED)
alpaka_add_executable(${PROJECT_NAME} vectoradd.cpp ../../common/alpakaHelpers.hpp ../../common/Stopwatch.hpp)
alpaka_add_executable(${PROJECT_NAME} vectoradd.cpp ../../common/Stopwatch.hpp)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama alpaka::alpaka)
14 changes: 9 additions & 5 deletions examples/alpaka/vectoradd/vectoradd.cpp
@@ -2,7 +2,7 @@
// SPDX-License-Identifier: CC0-1.0

#include "../../common/Stopwatch.hpp"
#include "../../common/hostname.hpp"
#include "../../common/env.hpp"

#include <alpaka/alpaka.hpp>
#include <alpaka/example/ExampleDefaultAcc.hpp>
@@ -189,24 +189,28 @@ catch(const std::exception& e)

auto main() -> int
{
using Acc = alpaka::ExampleDefaultAcc<alpaka::DimInt<1>, Size>;
const auto env = common::captureEnv<Acc>();
std::cout << problemSize / 1000 / 1000 << "M values "
<< "(" << problemSize * sizeof(float) / 1024 << "kiB)\n";
<< "(" << problemSize * sizeof(float) / 1024 << "kiB)\n"
<< env << '\n';

std::ofstream plotFile{"vectoradd_alpaka.sh"};
plotFile.exceptions(std::ios::badbit | std::ios::failbit);
plotFile << fmt::format(
R"(#!/usr/bin/gnuplot -p
set title "vectoradd alpaka {}Mi elements on {} on {}"
# {}
set title "vectoradd alpaka {}Mi elements on {}"
set style data histograms
set style fill solid
set xtics rotate by 45 right
set yrange [0:*]
set ylabel "runtime [s]"
$data << EOD
)",
env,
problemSize / 1024 / 1024,
alpaka::getAccName<alpaka::ExampleDefaultAcc<alpaka::DimInt<1>, Size>>(),
common::hostname());
alpaka::getAccName<Acc>());

boost::mp11::mp_for_each<boost::mp11::mp_iota_c<6>>([&](auto ic) { run<decltype(ic)::value>(plotFile); });

43 changes: 0 additions & 43 deletions examples/common/alpakaHelpers.hpp

This file was deleted.
