diff --git a/examples/alpaka/daxpy/CMakeLists.txt b/examples/alpaka/daxpy/CMakeLists.txt index e9efca28d8..ca0945c24b 100644 --- a/examples/alpaka/daxpy/CMakeLists.txt +++ b/examples/alpaka/daxpy/CMakeLists.txt @@ -9,7 +9,7 @@ if (NOT TARGET llama::llama) find_package(llama REQUIRED) endif() find_package(alpaka 1.0 REQUIRED) -alpaka_add_executable(${PROJECT_NAME} daxpy.cpp ../../common/Stopwatch.hpp ../../common/hostname.hpp) +alpaka_add_executable(${PROJECT_NAME} daxpy.cpp ../../common/Stopwatch.hpp ../../common/env.hpp) target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17) target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama OpenMP::OpenMP_CXX alpaka::alpaka) diff --git a/examples/alpaka/daxpy/daxpy.cpp b/examples/alpaka/daxpy/daxpy.cpp index 66e871df91..de507b7c64 100644 --- a/examples/alpaka/daxpy/daxpy.cpp +++ b/examples/alpaka/daxpy/daxpy.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: LGPL-3.0-or-later #include "../../common/Stopwatch.hpp" -#include "../../common/hostname.hpp" +#include "../../common/env.hpp" #include #include @@ -156,26 +156,20 @@ void daxpyAlpakaLlama(std::string mappingName, std::ofstream& plotFile, Mapping auto main() -> int try { - const auto numThreads = static_cast(omp_get_max_threads()); - const char* affinity = std::getenv("GOMP_CPU_AFFINITY"); // NOLINT(concurrency-mt-unsafe) - affinity = affinity == nullptr ? "NONE - PLEASE PIN YOUR THREADS!" 
: affinity; + const auto env = common::captureEnv(); fmt::print( - R"({}Mi doubles ({}MiB data) -Threads: {} -Affinity: {} -)", + "{}Mi doubles ({}MiB data)\n{}\n", problemSize / 1024 / 1024, problemSize * sizeof(double) / 1024 / 1024, - numThreads, - affinity); + env); std::ofstream plotFile{"daxpy.sh"}; plotFile.exceptions(std::ios::badbit | std::ios::failbit); plotFile << fmt::format( R"(#!/usr/bin/gnuplot -p -# threads: {} affinity: {} -set title "daxpy CPU {}Mi doubles on {}" +# {} +set title "daxpy CPU {}Mi doubles" set style data histograms set style fill solid set xtics rotate by 45 right @@ -184,10 +178,8 @@ set yrange [0:*] set ylabel "runtime [s]" $data << EOD )", - numThreads, - affinity, - problemSize / 1024 / 1024, - common::hostname()); + env, + problemSize / 1024 / 1024); daxpy(plotFile); diff --git a/examples/alpaka/nbody/nbody.cpp b/examples/alpaka/nbody/nbody.cpp index 6512a2051a..cc8573ec28 100644 --- a/examples/alpaka/nbody/nbody.cpp +++ b/examples/alpaka/nbody/nbody.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: LGPL-3.0-or-later #include "../../common/Stopwatch.hpp" -#include "../../common/hostname.hpp" +#include "../../common/env.hpp" #include #include @@ -356,21 +356,21 @@ try using Size = int; using Acc = alpaka::ExampleDefaultAcc; + const auto env = common::captureEnv(); std::cout << problemSize / 1000 << "k particles (" << problemSize * llama::sizeOf / 1024 << "kiB)\n" << "Caching " << threadsPerBlock << " particles (" << threadsPerBlock * llama::sizeOf / 1024 << " kiB) in shared memory\n" << "Reducing on " << elementsPerThread << " particles per thread\n" - << "Using " << threadsPerBlock << " threads per block\n"; - const auto dev = alpaka::getDevByIdx(alpaka::Platform{}, 0); - const auto props = alpaka::getAccDevProps(dev); - std::cout << "Running on " << alpaka::getName(dev) << ", " << props.m_sharedMemSizeBytes / 1024 << "kiB SM\n"; + << "Using " << threadsPerBlock << " threads per block\n" + << env << '\n'; std::cout << std::fixed; 
std::ofstream plotFile{"nbody_alpaka.sh"}; plotFile.exceptions(std::ios::badbit | std::ios::failbit); plotFile << fmt::format( R"(#!/usr/bin/gnuplot -p -set title "nbody alpaka {}ki particles on {} on {}" +# {} +set title "nbody alpaka {}ki particles on {}" set style data histograms set style fill solid set xtics rotate by 45 right @@ -382,9 +382,9 @@ set y2label "move runtime [s]" set y2tics auto $data << EOD )", + env, problemSize / 1024, - alpaka::getAccName(), - common::hostname()); + alpaka::getAccName()); plotFile << "\"\"\t\"update\"\t\"move\"\n"; run(plotFile); diff --git a/examples/alpaka/pic/pic.cpp b/examples/alpaka/pic/pic.cpp index cd52b4ef6c..de10611712 100644 --- a/examples/alpaka/pic/pic.cpp +++ b/examples/alpaka/pic/pic.cpp @@ -5,7 +5,7 @@ #if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !defined(ALPAKA_ACC_GPU_CUDA_ONLY_MODE) # define ALPAKA_ACC_GPU_CUDA_ONLY_MODE #endif -#include "../../common/hostname.hpp" +#include "../../common/env.hpp" #include #include @@ -877,29 +877,18 @@ void run(std::ostream& plotFile) auto main() -> int try { - const auto numThreads = static_cast(omp_get_max_threads()); - const char* affinity = std::getenv("GOMP_CPU_AFFINITY"); // NOLINT(concurrency-mt-unsafe) - affinity = affinity == nullptr ? "NONE - PLEASE PIN YOUR THREADS!" 
: affinity; - using Acc = alpaka::ExampleDefaultAcc; - auto accName = alpaka::getName(alpaka::getDevByIdx(alpaka::Platform{}, 0u)); - while(static_cast(std::isspace(accName.back()))) - accName.pop_back(); - fmt::print( - "Running {} steps with grid {}x{} and {}k particles on {}\n", - nsteps, - gridX, - gridY, - numpart / 1000, - accName); + const auto env = common::captureEnv(); + const auto accName = common::trim(alpaka::getName(alpaka::getDevByIdx(alpaka::Platform{}, 0u))); + fmt::print("Running {} steps with grid {}x{} and {}k particles\n{}\n", nsteps, gridX, gridY, numpart / 1000, env); std::ofstream plotFile{"pic.sh"}; plotFile.exceptions(std::ios::badbit | std::ios::failbit); fmt::print( plotFile, R"aa(#!/usr/bin/gnuplot -p -# threads: {} affinity: {} -set title "PIC grid {}x{} {}k particles on {} ({})" +# {} +set title "PIC grid {}x{} {}k particles on {}" set style data histograms set style fill solid set xtics rotate by 45 right @@ -909,14 +898,11 @@ set ylabel "runtime [s]" $data << EOD "" "clr J" "integr" " dep J" " bnd J" "adv B1" "bnd B1" " adv E" " bnd E" "adv B2" "bnd B2" "total" )aa", - numThreads, - affinity, + env, gridX, gridY, numpart / 1000, - accName, - common::hostname()); - + accName); // FieldMapping: AoS RM, AoS CM, AoS Mo, // SoA RM, SoA CM, SoA Mo, diff --git a/examples/alpaka/vectoradd/vectoradd.cpp b/examples/alpaka/vectoradd/vectoradd.cpp index a70667c51e..03ea555457 100644 --- a/examples/alpaka/vectoradd/vectoradd.cpp +++ b/examples/alpaka/vectoradd/vectoradd.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: CC0-1.0 #include "../../common/Stopwatch.hpp" -#include "../../common/hostname.hpp" +#include "../../common/env.hpp" #include #include @@ -189,14 +189,18 @@ catch(const std::exception& e) auto main() -> int { + using Acc = alpaka::ExampleDefaultAcc, Size>; + const auto env = common::captureEnv(); std::cout << problemSize / 1000 / 1000 << "M values " - << "(" << problemSize * sizeof(float) / 1024 << "kiB)\n"; + << "(" << problemSize 
* sizeof(float) / 1024 << "kiB)\n" + << env << '\n'; std::ofstream plotFile{"vectoradd_alpaka.sh"}; plotFile.exceptions(std::ios::badbit | std::ios::failbit); plotFile << fmt::format( R"(#!/usr/bin/gnuplot -p -set title "vectoradd alpaka {}Mi elements on {} on {}" +# {} +set title "vectoradd alpaka {}Mi elements on {}" set style data histograms set style fill solid set xtics rotate by 45 right @@ -204,9 +208,9 @@ set yrange [0:*] set ylabel "runtime [s]" $data << EOD )", + env, problemSize / 1024 / 1024, - alpaka::getAccName, Size>>(), - common::hostname()); + alpaka::getAccName()); boost::mp11::mp_for_each>([&](auto ic) { run(plotFile); }); diff --git a/examples/common/env.hpp b/examples/common/env.hpp new file mode 100644 index 0000000000..f21638501a --- /dev/null +++ b/examples/common/env.hpp @@ -0,0 +1,125 @@ +// Copyright 2021 Bernhard Manfred Gruber +// SPDX-License-Identifier: LGPL-3.0-or-later + +#pragma once + +#include +#if __has_include() +# include +#endif +#ifdef ALPAKA_DEBUG // defined when the cmake target links to alpaka +# include +#endif +#include +#include +#ifdef _WIN32 +# define NOMINMAX +# define WIN32_LEAN_AND_MEAN +# include +# pragma comment(lib, "ws2_32") +#else +# include +#endif + +namespace common +{ + // We used boost::asio::ip::host_name() originally, but it complicated the disassembly and requires asio as + // additional dependency. 
+    // Returns the host name reported by gethostname(), or "unknown" if the query fails.
+    inline auto hostname() -> std::string
+    {
+        char name[256] = {}; // zero-filled, so the result is NUL-terminated even if the name is truncated
+        if(::gethostname(name, 255) != 0) // the last byte is never written and stays '\0'
+            return "unknown";
+        return name;
+    }
+
+    // Returns s without leading and trailing whitespace (as classified by std::isspace).
+    inline auto trim(std::string s) -> std::string
+    {
+        // std::isspace requires a value representable as unsigned char; a negative plain char would be UB.
+        const auto pred = [](char c) { return std::isspace(static_cast<unsigned char>(c)) == 0; };
+        s.erase(std::find_if(rbegin(s), rend(s), pred).base(), end(s));
+        s.erase(begin(s), std::find_if(begin(s), end(s), pred));
+        return s;
+    }
+
+    // Builds a one-line, "; "-separated description of the environment the benchmark runs in:
+    //  * the host name (always),
+    //  * OpenMP max threads and the OMP_PROC_BIND/OMP_PLACES settings (only if compiled with OpenMP),
+    //  * the widest SIMD instruction set the compiler was allowed to target (compile-time macros),
+    //  * the alpaka accelerator and device 0 (only if ALPAKA_DEBUG is defined and AlpakaAcc is not void),
+    //  * the current CUDA device (only if compiled by nvcc or clang in CUDA mode).
+    // AlpakaAcc: alpaka accelerator type to describe; leave it as void to omit the alpaka section.
+    template<typename AlpakaAcc = void>
+    inline auto captureEnv() -> std::string
+    {
+        std::string env;
+
+        // hostname
+        env += fmt::format("Host: {}", hostname());
+
+        // OpenMP
+#ifdef _OPENMP
+        const auto maxThreads = static_cast<std::size_t>(omp_get_max_threads());
+        const char* ompProcBind = std::getenv("OMP_PROC_BIND"); // NOLINT(concurrency-mt-unsafe)
+        const char* ompPlaces = std::getenv("OMP_PLACES"); // NOLINT(concurrency-mt-unsafe)
+        ompProcBind = ompProcBind == nullptr ? "no - PLEASE DEFINE ENV.VAR. OMP_PROC_BIND!" : ompProcBind;
+        ompPlaces = ompPlaces == nullptr ? "nothing - PLEASE DEFINE ENV.VAR. OMP_PLACES!" : ompPlaces;
+        env += fmt::format("; OpenMP: max {} threads, bound {}, to {}", maxThreads, ompProcBind, ompPlaces);
+#endif
+
+        // SIMD: checked from the widest to the narrowest extension, the first enabled one wins
+        std::string simdArch =
+#if defined(__AVX512F__)
+            "AVX512F";
+#elif defined(__AVX2__)
+            "AVX2";
+#elif defined(__AVX__)
+            "AVX";
+#elif defined(__SSE4_2__)
+            "SSE4.2";
+#elif defined(__SSE4_1__)
+            "SSE4.1";
+#elif defined(__SSE3__)
+            "SSE3";
+#elif defined(__SSE2__)
+            "SSE2";
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+            "NEON";
+#elif defined(__ALTIVEC__)
+            "ALTIVEC";
+#else
+            "unknown";
+#endif
+
+#ifdef __FMA__
+        simdArch += "+FMA";
+#endif
+        env += fmt::format("; SIMD: {}", simdArch);
+
+        // alpaka
+#ifdef ALPAKA_DEBUG // defined when the cmake target links to alpaka
+        if constexpr(!std::is_void_v<AlpakaAcc>)
+        {
+            using Acc = AlpakaAcc;
+            auto accName = alpaka::getAccName<Acc>();
+            // drop template arguments; a name without '<' is kept as is
+            if(const auto pos = accName.find_first_of('<'); pos != std::string::npos)
+                accName.erase(pos);
+            const auto dev = getDevByIdx(alpaka::Platform<Acc>{}, 0u);
+            const auto devName = trim(getName(dev)); // TODO(bgruber): drop trim after fix lands in alpaka
+            const auto devProps = alpaka::getAccDevProps<Acc>(dev);
+            env += fmt::format(
+                "; alpaka acc: {}, dev[0]: {}, SMem: {}KiB",
+                accName,
+                devName,
+                devProps.m_sharedMemSizeBytes / 1024);
+        }
+#endif
+
+        // CUDA
+#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDACC__))
+        {
+            int device = 0;
+            cudaDeviceProp prop{};
+            // only report properties that were actually queried successfully
+            if(cudaGetDevice(&device) == cudaSuccess && cudaGetDeviceProperties(&prop, device) == cudaSuccess)
+                env += fmt::format(
+                    "; CUDA dev: {}, {}MiB GM, {}KiB SM",
+                    prop.name,
+                    prop.totalGlobalMem / 1024 / 1024,
+                    prop.sharedMemPerBlock / 1024);
+            else
+                env += "; CUDA dev: unavailable";
+        }
+#endif
+
+        return env;
+    }
+} // namespace common
diff --git a/examples/common/hostname.hpp b/examples/common/hostname.hpp
deleted file mode 100644
index c7aeb371b4..0000000000
--- a/examples/common/hostname.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright 2021 Bernhard Manfred Gruber
-// SPDX-License-Identifier: LGPL-3.0-or-later
-
-#pragma once
-
-#include
-#ifdef _WIN32
-#    define NOMINMAX
-#    define WIN32_LEAN_AND_MEAN
-#    include
-#    pragma comment(lib, "ws2_32")
-#else
-#    include
-#endif
-
-namespace common
-{
-    // We used boost::asio::ip::host_name() originally, but it complicated the disassembly and requires asio as
-    // additional dependency.
- inline auto hostname() -> std::string - { - char name[256]; - ::gethostname(name, 256); - return name; - } -} // namespace common diff --git a/examples/cuda/nbody/nbody.cu b/examples/cuda/nbody/nbody.cu index 514afd8567..503aaf2e26 100644 --- a/examples/cuda/nbody/nbody.cu +++ b/examples/cuda/nbody/nbody.cu @@ -2,7 +2,7 @@ // SPDX-License-Identifier: LGPL-3.0-or-later #include "../../common/Stopwatch.hpp" -#include "../../common/hostname.hpp" +#include "../../common/env.hpp" #include #include @@ -523,25 +523,24 @@ namespace gpugems auto main() -> int try { + const auto env = common::captureEnv(); std::cout << problemSize / 1024 << "ki particles (" << problemSize * llama::sizeOf / 1024 << "kiB)\n" << "Caching " << sharedElementsPerBlock << " particles (" << sharedElementsPerBlock * llama::sizeOf / 1024 << " kiB) in shared memory\n" - << "Using " << threadsPerBlock << " threads per block\n"; + << "Using " << threadsPerBlock << " threads per block\n" + << env << '\n'; + std::cout << std::fixed; + int device = 0; cudaGetDevice(&device); cudaDeviceProp prop{}; cudaGetDeviceProperties(&prop, device); - fmt::print( - "Running on {}, {}MiB GM, {}kiB SM\n", - prop.name, - prop.totalGlobalMem / 1024 / 1024, - prop.sharedMemPerBlock / 1024); - std::cout << std::fixed; std::ofstream plotFile{"nbody_cuda.sh"}; plotFile.exceptions(std::ios::badbit | std::ios::failbit); plotFile << fmt::format( R"(#!/usr/bin/gnuplot -p +# {} set title "nbody CUDA {}ki particles on {}" set style data histograms set style fill solid @@ -554,8 +553,9 @@ set y2label "move runtime [s]" set y2tics auto $data << EOD )", + env, problemSize / 1024, - common::hostname()); + prop.name); plotFile << "\"\"\t\"update\"\t\"move\"\n"; using namespace boost::mp11; diff --git a/examples/cuda/viewcopy/viewcopy.cu b/examples/cuda/viewcopy/viewcopy.cu index 348218b0c5..a36095d309 100644 --- a/examples/cuda/viewcopy/viewcopy.cu +++ b/examples/cuda/viewcopy/viewcopy.cu @@ -1,5 +1,5 @@ #include 
"../../common/Stopwatch.hpp" -#include "../../common/hostname.hpp" +#include "../../common/env.hpp" #include "../../common/ttjet_13tev_june2019.hpp" #include diff --git a/examples/nbody/nbody.cpp b/examples/nbody/nbody.cpp index 54a96d0f59..123d46bb7d 100644 --- a/examples/nbody/nbody.cpp +++ b/examples/nbody/nbody.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: LGPL-3.0-or-later #include "../common/Stopwatch.hpp" -#include "../common/hostname.hpp" +#include "../common/env.hpp" #include #include @@ -1549,36 +1549,15 @@ auto arePositionsClose(const std::vector& finalPositions) -> bool auto main() -> int try { -#ifdef HAVE_XSIMD - using Simd = xsimd::batch; - // using Simd = xsimd::make_sized_batch_t; - constexpr auto simdLanes = Simd::size; -#else - constexpr auto simdLanes = 1; -#endif - - const auto numThreads = static_cast(omp_get_max_threads()); - const char* affinity = std::getenv("GOMP_CPU_AFFINITY"); // NOLINT(concurrency-mt-unsafe) - affinity = affinity == nullptr ? "NONE - PLEASE PIN YOUR THREADS!" 
: affinity; - - fmt::print( - R"({}ki particles ({}kiB) -Threads: {} -Affinity: {} -SIMD lanes: {} -)", - problemSize / 1024, - problemSize * sizeof(FP) * 7 / 1024, - numThreads, - affinity, - simdLanes); + const auto env = common::captureEnv(); + fmt::print("{}ki particles ({}kiB)\n{}\n", problemSize / 1024, problemSize * sizeof(FP) * 7 / 1024, env); std::ofstream plotFile{"nbody.sh"}; plotFile.exceptions(std::ios::badbit | std::ios::failbit); plotFile << fmt::format( R"(#!/usr/bin/gnuplot -p -# threads: {} affinity: {} SIMD lanes: {} -set title "nbody CPU {}ki particles on {}" +# {} +set title "nbody CPU {}ki particles" set style data histograms set style fill solid set xtics rotate by 45 right @@ -1590,11 +1569,8 @@ set y2label "move runtime [s]" set y2tics auto $data << EOD )", - numThreads, - affinity, - simdLanes, - problemSize / 1024, - common::hostname()); + env, + problemSize / 1024); plotFile << "\"\"\t\"update\"\t\"move\"\n"; // Note: @@ -1635,6 +1611,7 @@ set y2tics auto finalPositions.push_back(manualAoSoAManualAVX::main(plotFile, false)); #endif #ifdef HAVE_XSIMD + using Simd = xsimd::batch; const auto maxThreads = std::thread::hardware_concurrency(); for(int threads = 1; threads <= std::thread::hardware_concurrency(); threads *= 2) { diff --git a/examples/sycl/nbody/nbody.cpp b/examples/sycl/nbody/nbody.cpp index 3dd01da81e..53326d4e71 100644 --- a/examples/sycl/nbody/nbody.cpp +++ b/examples/sycl/nbody/nbody.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: LGPL-3.0-or-later #include "../../common/Stopwatch.hpp" -#include "../../common/hostname.hpp" +#include "../../common/env.hpp" #include #include diff --git a/examples/vectoradd/vectoradd.cpp b/examples/vectoradd/vectoradd.cpp index 429a44bcab..f87be0a067 100644 --- a/examples/vectoradd/vectoradd.cpp +++ b/examples/vectoradd/vectoradd.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: LGPL-3.0-or-later #include "../common/Stopwatch.hpp" -#include "../common/hostname.hpp" +#include "../common/env.hpp" 
#include #include @@ -315,14 +315,17 @@ namespace manualAoSoA auto main() -> int try { + const auto env = common::captureEnv(); std::cout << problemSize / 1000 / 1000 << "M values " - << "(" << problemSize * sizeof(float) / 1024 << "kiB)\n"; + << "(" << problemSize * sizeof(float) / 1024 << "kiB)\n" + << env << '\n'; std::ofstream plotFile{"vectoradd.sh"}; plotFile.exceptions(std::ios::badbit | std::ios::failbit); plotFile << fmt::format( R"(#!/usr/bin/gnuplot -p -set title "vectoradd CPU {}Mi elements on {}" +# {} +set title "vectoradd CPU {}Mi elements" set style data histograms set style fill solid set xtics rotate by 45 right @@ -330,8 +333,8 @@ set yrange [0:*] set ylabel "runtime [s]" $data << EOD )", - problemSize / 1024 / 1024, - common::hostname()); + env, + problemSize / 1024 / 1024); int r = 0; boost::mp11::mp_for_each>([&](auto ic) diff --git a/examples/viewcopy/viewcopy.cpp b/examples/viewcopy/viewcopy.cpp index bca79565ff..d2c4f6059c 100644 --- a/examples/viewcopy/viewcopy.cpp +++ b/examples/viewcopy/viewcopy.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: LGPL-3.0-or-later #include "../common/Stopwatch.hpp" -#include "../common/hostname.hpp" +#include "../common/env.hpp" #include "../common/ttjet_13tev_june2019.hpp" #include @@ -127,25 +127,16 @@ auto prepareViewAndHash(Mapping mapping) auto main() -> int try { + const auto env = common::captureEnv(); const auto dataSize = llama::product(extents) * llama::sizeOf; - const auto numThreads = static_cast(omp_get_max_threads()); - const char* affinity = std::getenv("GOMP_CPU_AFFINITY"); // NOLINT(concurrency-mt-unsafe) - affinity = affinity == nullptr ? "NONE - PLEASE PIN YOUR THREADS!" 
: affinity; - fmt::print( - R"(Data size: {}MiB -Threads: {} -Affinity: {} -)", - dataSize / 1024 / 1024, - numThreads, - affinity); + fmt::print("Data size: {}MiB\n{}\n", dataSize / 1024 / 1024, env); std::ofstream plotFile{"viewcopy.sh"}; plotFile.exceptions(std::ios::badbit | std::ios::failbit); plotFile << fmt::format( R"(#!/usr/bin/gnuplot -p -# threads: {} affinity: {} -set title "viewcopy CPU {}MiB particles on {}" +# {} +set title "viewcopy CPU {}MiB particles" set style data histograms set style fill solid set xtics rotate by 45 right @@ -153,10 +144,8 @@ set key out top center maxrows 4 set ylabel "throughput [GiB/s]" $data << EOD )", - numThreads, - affinity, - dataSize / 1024 / 1024, - common::hostname()); + env, + dataSize / 1024 / 1024); plotFile << "\"\"\t\"memcpy\"\t\"memcpy\\\\\\_avx2\"\t\"memcpy(p)\"\t\"memcpy\\\\\\_avx2(p)\"\t\"naive " "copy\"\t\"std::copy\"\t\"aosoa copy(r)\"\t\"aosoa copy(w)\"\t\"naive copy(p)\"\t\"aosoa "