diff --git a/CMakeLists.txt b/CMakeLists.txt index 19afbf14e8..cbadde91e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -86,7 +86,6 @@ if (LLAMA_BUILD_EXAMPLES) add_subdirectory("examples/bytesplit") add_subdirectory("examples/bitpackint") add_subdirectory("examples/bitpackfloat") - add_subdirectory("examples/daxpy") # alpaka examples find_package(alpaka 0.7.0 QUIET) @@ -95,6 +94,7 @@ if (LLAMA_BUILD_EXAMPLES) add_subdirectory("examples/alpaka/vectoradd") add_subdirectory("examples/alpaka/asyncblur") add_subdirectory("examples/alpaka/pic") + add_subdirectory("examples/alpaka/daxpy") elseif() message(WARNING "Could not find alpaka. Alpaka examples are disabled.") endif() diff --git a/examples/daxpy/CMakeLists.txt b/examples/alpaka/daxpy/CMakeLists.txt similarity index 69% rename from examples/daxpy/CMakeLists.txt rename to examples/alpaka/daxpy/CMakeLists.txt index 180e233963..72dc69a7c6 100644 --- a/examples/daxpy/CMakeLists.txt +++ b/examples/alpaka/daxpy/CMakeLists.txt @@ -1,15 +1,15 @@ cmake_minimum_required (VERSION 3.15) -project(llama-daxpy CXX) +project(llama-alpaka-daxpy CXX) #find_package(Vc QUIET) find_package(OpenMP REQUIRED) if (NOT TARGET llama::llama) find_package(llama REQUIRED) endif() - -add_executable(${PROJECT_NAME} daxpy.cpp) +find_package(alpaka 0.7.0 REQUIRED) +alpaka_add_executable(${PROJECT_NAME} daxpy.cpp ../../common/Stopwatch.hpp ../../common/hostname.hpp) target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17) -target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama OpenMP::OpenMP_CXX) +target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama OpenMP::OpenMP_CXX alpaka::alpaka) if (MSVC) target_compile_options(${PROJECT_NAME} PRIVATE /arch:AVX2 /fp:fast) diff --git a/examples/alpaka/daxpy/daxpy.cpp b/examples/alpaka/daxpy/daxpy.cpp new file mode 100644 index 0000000000..8a2d05a32b --- /dev/null +++ b/examples/alpaka/daxpy/daxpy.cpp @@ -0,0 +1,230 @@ +#include "../../common/Stopwatch.hpp" +#include "../../common/hostname.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +constexpr auto PROBLEM_SIZE = std::size_t{1024 * 1024 * 128}; +constexpr auto BLOCK_SIZE = std::size_t{256}; +constexpr auto WARMUP_STEPS = 1; +constexpr auto STEPS = 5; +constexpr auto alpha = 3.14; + +static_assert(PROBLEM_SIZE % BLOCK_SIZE == 0); + +void daxpy(std::ofstream& plotFile) +{ + const auto* title = "baseline std::vector"; + std::cout << title << "\n"; + + Stopwatch watch; + auto x = std::vector(PROBLEM_SIZE); + auto y = std::vector(PROBLEM_SIZE); + auto z = std::vector(PROBLEM_SIZE); + watch.printAndReset("alloc"); + + for(std::size_t i = 0; i < PROBLEM_SIZE; ++i) + { + x[i] = static_cast(i); + y[i] = static_cast(i); + } + watch.printAndReset("init"); + + double sum = 0; + for(std::size_t s = 0; s < WARMUP_STEPS + STEPS; ++s) + { +#pragma omp parallel for + for(std::ptrdiff_t i = 0; i < PROBLEM_SIZE; i++) + z[i] = alpha * x[i] + y[i]; + if(s < WARMUP_STEPS) + watch.printAndReset("daxpy (warmup)"); + else + sum += watch.printAndReset("daxpy"); + } + plotFile << std::quoted(title) << "\t" << sum / STEPS << '\n'; +} + +template +inline constexpr bool isGPU = false; + +#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED +template +inline constexpr bool isGPU> = true; +#endif + +template +void daxpy_alpaka_llama(std::string mappingName, std::ofstream& plotFile, Mapping mapping) +{ + std::size_t storageSize = 0; + for(std::size_t i = 0; i < mapping.blobCount; i++) + storageSize += mapping.blobSize(i); + + auto title = "alpaka/LLAMA " + std::move(mappingName); + fmt::print("{0} (blobs size: {1}MiB)\n", title, storageSize / 1024 / 1024); + + using Dim = alpaka::DimInt<1>; + using Size = std::size_t; + using Acc = alpaka::ExampleDefaultAcc; + using Dev = alpaka::Dev; + using Queue = alpaka::Queue; + const auto devAcc = alpaka::getDevByIdx>(0u); + const auto devHost = alpaka::getDevByIdx(0u); + auto queue = Queue(devAcc); + + Stopwatch watch; + auto x = llama::allocViewUninitialized(mapping); + auto y = llama::allocViewUninitialized(mapping); + auto z = llama::allocViewUninitialized(mapping); + watch.printAndReset("alloc host"); + + for(std::size_t i = 0; i < PROBLEM_SIZE; ++i) + { + x[i] = static_cast(i); + y[i] = static_cast(i); + } + watch.printAndReset("init host"); + + static_assert(Mapping::blobCount == 1); // make our life simpler + const auto bufferSize = mapping.blobSize(0); + const auto extents = alpaka::Vec{bufferSize}; + auto bufferX = alpaka::allocBuf(devAcc, extents); + auto bufferY = alpaka::allocBuf(devAcc, extents); + auto bufferZ = alpaka::allocBuf(devAcc, extents); + watch.printAndReset("alloc device"); + + { + auto vx = alpaka::createView(devHost, &x.storageBlobs[0][0], extents); + auto vy = alpaka::createView(devHost, &y.storageBlobs[0][0], extents); + alpaka::memcpy(queue, bufferX, vx, extents); + alpaka::memcpy(queue, bufferY, vy, extents); + } + watch.printAndReset("copy H->D"); + + auto viewX = llama::View{mapping, llama::Array{alpaka::getPtrNative(bufferX)}}; + auto viewY = llama::View{mapping, llama::Array{alpaka::getPtrNative(bufferY)}}; + auto viewZ = llama::View{mapping, llama::Array{alpaka::getPtrNative(bufferZ)}}; + + constexpr auto blockSize = isGPU ? BLOCK_SIZE : 1; + const auto workdiv = alpaka::WorkDivMembers( + alpaka::Vec{PROBLEM_SIZE / blockSize}, + alpaka::Vec{blockSize}, + alpaka::Vec{Size{1}}); + watch = {}; + + double sum = 0; + for(std::size_t s = 0; s < WARMUP_STEPS + STEPS; ++s) + { + auto kernel + = [] ALPAKA_FN_ACC(const Acc& acc, decltype(viewX) x, decltype(viewY) y, double alpha, decltype(viewZ) z) + { + const auto [i] = alpaka::getIdx(acc); + z[i] = alpha * x[i] + y[i]; + }; + alpaka::exec(queue, workdiv, kernel, viewX, viewY, alpha, viewZ); + if(s < WARMUP_STEPS) + watch.printAndReset("daxpy (warmup)"); + else + sum += watch.printAndReset("daxpy"); + } + + { + auto vz = alpaka::createView(devHost, &z.storageBlobs[0][0], extents); + alpaka::memcpy(queue, vz, bufferZ, extents); + } + watch.printAndReset("copy D->H"); + + plotFile << std::quoted(title) << "\t" << sum / STEPS << '\n'; +} + +auto main() -> int +try +{ + const auto numThreads = static_cast(omp_get_max_threads()); + const char* affinity = std::getenv("GOMP_CPU_AFFINITY"); // NOLINT(concurrency-mt-unsafe) + affinity = affinity == nullptr ? "NONE - PLEASE PIN YOUR THREADS!" : affinity; + + fmt::print( + R"({}Mi doubles ({}MiB data) +Threads: {} +Affinity: {} +)", + PROBLEM_SIZE / 1024 / 1024, + PROBLEM_SIZE * sizeof(double) / 1024 / 1024, + numThreads, + affinity); + + std::ofstream plotFile{"daxpy.sh"}; + plotFile.exceptions(std::ios::badbit | std::ios::failbit); + plotFile << fmt::format( + R"(#!/usr/bin/gnuplot -p +# threads: {} affinity: {} +set title "daxpy CPU {}Mi doubles on {}" +set style data histograms +set style fill solid +set xtics rotate by 45 right +set key off +set yrange [0:*] +set ylabel "runtime [s]" +$data << EOD +)", + numThreads, + affinity, + PROBLEM_SIZE / 1024 / 1024, + common::hostname()); + + daxpy(plotFile); + + const auto extents = llama::ArrayExtents{PROBLEM_SIZE}; + daxpy_alpaka_llama("AoS", plotFile, llama::mapping::AoS{extents, double{}}); + daxpy_alpaka_llama("SoA", plotFile, llama::mapping::SoA, double, false>{extents}); + daxpy_alpaka_llama( + "Bytesplit", + plotFile, + llama::mapping::Bytesplit, double, llama::mapping::BindAoS<>::fn>{extents}); + daxpy_alpaka_llama( + "ChangeType D->F", + plotFile, + llama::mapping::ChangeType< + llama::ArrayExtentsDynamic<1>, + double, + llama::mapping::BindAoS<>::fn, + boost::mp11::mp_list>>{extents}); + daxpy_alpaka_llama("Bitpack 52^{11}", plotFile, llama::mapping::BitPackedFloatSoA{extents, 11, 52, double{}}); + daxpy_alpaka_llama( + "Bitpack 52^{11} CT", + plotFile, + llama::mapping:: + BitPackedFloatSoA, double, llama::Constant<11>, llama::Constant<52>>{ + extents}); + daxpy_alpaka_llama("Bitpack 23^{8}", plotFile, llama::mapping::BitPackedFloatSoA{extents, 8, 23, double{}}); + daxpy_alpaka_llama( + "Bitpack 23^{8} CT", + plotFile, + llama::mapping:: + BitPackedFloatSoA, double, llama::Constant<8>, llama::Constant<23>>{ + extents}); + daxpy_alpaka_llama("Bitpack 10^{5}", plotFile, llama::mapping::BitPackedFloatSoA{extents, 5, 10, double{}}); + daxpy_alpaka_llama( + "Bitpack 10^{5} CT", + plotFile, + llama::mapping:: + BitPackedFloatSoA, double, llama::Constant<5>, llama::Constant<10>>{ + extents}); + + plotFile << R"(EOD +plot $data using 2:xtic(1) +)"; + std::cout << "Plot with: ./daxpy.sh\n"; + + return 0; +} +catch(const std::exception& e) +{ + std::cerr << "Exception: " << e.what() << '\n'; +} diff --git a/examples/daxpy/daxpy.cpp b/examples/daxpy/daxpy.cpp deleted file mode 100644 index aa184ab7d8..0000000000 --- a/examples/daxpy/daxpy.cpp +++ /dev/null @@ -1,163 +0,0 @@ -#include "../common/Stopwatch.hpp" -#include "../common/hostname.hpp" - -#include -#include -#include -#include -#include -#include - -constexpr auto PROBLEM_SIZE = 1024 * 1024 * 128; -constexpr auto STEPS = 5; -constexpr auto alpha = 3.14; - -void daxpy(std::ofstream& plotFile) -{ - const auto* title = "std::vector"; - std::cout << title << "\n"; - - Stopwatch watch; - auto x = std::vector(PROBLEM_SIZE); - auto y = std::vector(PROBLEM_SIZE); - auto z = std::vector(PROBLEM_SIZE); - watch.printAndReset("alloc"); - - for(std::size_t i = 0; i < PROBLEM_SIZE; ++i) - { - x[i] = static_cast(i); - y[i] = static_cast(i); - } - watch.printAndReset("init"); - - double sum = 0; - for(std::size_t s = 0; s < STEPS; ++s) - { -#pragma omp parallel for - for(std::ptrdiff_t i = 0; i < PROBLEM_SIZE; i++) - z[i] = alpha * x[i] + y[i]; - sum = watch.printAndReset("daxpy"); - } - plotFile << std::quoted(title) << "\t" << sum / STEPS << '\n'; -} - -template -void daxpy_llama(std::string mappingName, std::ofstream& plotFile, Mapping mapping) -{ - std::size_t storageSize = 0; - for(std::size_t i = 0; i < mapping.blobCount; i++) - storageSize += mapping.blobSize(i); - - auto title = "LLAMA " + std::move(mappingName); - fmt::print("{0} (blobs size: {1}MiB)\n", title, storageSize / 1024 / 1024); - - Stopwatch watch; - auto x = llama::allocViewUninitialized(mapping); - auto y = llama::allocViewUninitialized(mapping); - auto z = llama::allocViewUninitialized(mapping); - watch.printAndReset("alloc"); - - for(std::size_t i = 0; i < PROBLEM_SIZE; ++i) - { - x[i] = static_cast(i); - y[i] = static_cast(i); - } - watch.printAndReset("init"); - - double sum = 0; - for(std::size_t s = 0; s < STEPS; ++s) - { -#pragma omp parallel for - for(std::ptrdiff_t i = 0; i < PROBLEM_SIZE; i++) - z[i] = alpha * x[i] + y[i]; - sum = watch.printAndReset("daxpy"); - } - plotFile << std::quoted(title) << "\t" << sum / STEPS << '\n'; -} - -auto main() -> int -try -{ - const auto numThreads = static_cast(omp_get_max_threads()); - const char* affinity = std::getenv("GOMP_CPU_AFFINITY"); // NOLINT(concurrency-mt-unsafe) - affinity = affinity == nullptr ? "NONE - PLEASE PIN YOUR THREADS!" : affinity; - - fmt::print( - R"({}Mi doubles ({}MiB data) -Threads: {} -Affinity: {} -)", - PROBLEM_SIZE / 1024 / 1024, - PROBLEM_SIZE * sizeof(double) / 1024 / 1024, - numThreads, - affinity); - - std::ofstream plotFile{"daxpy.sh"}; - plotFile.exceptions(std::ios::badbit | std::ios::failbit); - plotFile << fmt::format( - R"(#!/usr/bin/gnuplot -p -# threads: {} affinity: {} -set title "daxpy CPU {}Mi doubles on {}" -set style data histograms -set style fill solid -set xtics rotate by 45 right -set key off -set yrange [0:*] -set ylabel "runtime [s]" -$data << EOD -)", - numThreads, - affinity, - PROBLEM_SIZE / 1024 / 1024, - common::hostname()); - - daxpy(plotFile); - - const auto extents = llama::ArrayExtents{PROBLEM_SIZE}; - daxpy_llama("AoS", plotFile, llama::mapping::AoS{extents, double{}}); - daxpy_llama("SoA", plotFile, llama::mapping::SoA{extents, double{}}); - daxpy_llama( - "Bytesplit", - plotFile, - llama::mapping::Bytesplit, double, llama::mapping::BindAoS<>::fn>{extents}); - daxpy_llama( - "ChangeType D->F", - plotFile, - llama::mapping::ChangeType< - llama::ArrayExtentsDynamic<1>, - double, - llama::mapping::BindAoS<>::fn, - boost::mp11::mp_list>>{extents}); - daxpy_llama("Bitpack 52^{11}", plotFile, llama::mapping::BitPackedFloatSoA{extents, 11, 52, double{}}); - daxpy_llama( - "Bitpack 52^{11} CT", - plotFile, - llama::mapping:: - BitPackedFloatSoA, double, llama::Constant<11>, llama::Constant<52>>{ - extents}); - daxpy_llama("Bitpack 23^{8}", plotFile, llama::mapping::BitPackedFloatSoA{extents, 8, 23, double{}}); - daxpy_llama( - "Bitpack 23^{8} CT", - plotFile, - llama::mapping:: - BitPackedFloatSoA, double, llama::Constant<8>, llama::Constant<23>>{ - extents}); - daxpy_llama("Bitpack 10^{5}", plotFile, llama::mapping::BitPackedFloatSoA{extents, 5, 10, double{}}); - daxpy_llama( - "Bitpack 10^{5} CT", - plotFile, - llama::mapping:: - BitPackedFloatSoA, double, llama::Constant<5>, llama::Constant<10>>{ - extents}); - - plotFile << R"(EOD -plot $data using 2:xtic(1) -)"; - std::cout << "Plot with: ./daxpy.sh\n"; - - return 0; -} -catch(const std::exception& e) -{ - std::cerr << "Exception: " << e.what() << '\n'; -}