Skip to content

Commit

Permalink
convert daxpy example to alpaka
Browse files Browse the repository at this point in the history
  • Loading branch information
bernhardmgruber committed Feb 11, 2022
1 parent bcc813d commit 30cfe97
Show file tree
Hide file tree
Showing 4 changed files with 235 additions and 168 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ if (LLAMA_BUILD_EXAMPLES)
add_subdirectory("examples/bytesplit")
add_subdirectory("examples/bitpackint")
add_subdirectory("examples/bitpackfloat")
add_subdirectory("examples/daxpy")

# alpaka examples
find_package(alpaka 0.7.0 QUIET)
Expand All @@ -95,6 +94,7 @@ if (LLAMA_BUILD_EXAMPLES)
add_subdirectory("examples/alpaka/vectoradd")
add_subdirectory("examples/alpaka/asyncblur")
add_subdirectory("examples/alpaka/pic")
add_subdirectory("examples/alpaka/daxpy")
elseif()
message(WARNING "Could not find alpaka. Alpaka examples are disabled.")
endif()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
cmake_minimum_required (VERSION 3.15)
project(llama-daxpy CXX)
project(llama-alpaka-daxpy CXX)

#find_package(Vc QUIET)
find_package(OpenMP REQUIRED)
if (NOT TARGET llama::llama)
find_package(llama REQUIRED)
endif()

add_executable(${PROJECT_NAME} daxpy.cpp)
find_package(alpaka 0.7.0 REQUIRED)
alpaka_add_executable(${PROJECT_NAME} daxpy.cpp ../../common/Stopwatch.hpp ../../common/hostname.hpp)
target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama OpenMP::OpenMP_CXX)
target_link_libraries(${PROJECT_NAME} PRIVATE llama::llama OpenMP::OpenMP_CXX alpaka::alpaka)

if (MSVC)
target_compile_options(${PROJECT_NAME} PRIVATE /arch:AVX2 /fp:fast)
Expand Down
230 changes: 230 additions & 0 deletions examples/alpaka/daxpy/daxpy.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
#include "../../common/Stopwatch.hpp"
#include "../../common/hostname.hpp"

#include <alpaka/alpaka.hpp>
#include <alpaka/example/ExampleDefaultAcc.hpp>
#include <fmt/core.h>
#include <fstream>
#include <iomanip>
#include <llama/llama.hpp>
#include <omp.h>
#include <vector>

constexpr auto PROBLEM_SIZE = std::size_t{1024 * 1024 * 128};
constexpr auto BLOCK_SIZE = std::size_t{256};
constexpr auto WARMUP_STEPS = 1;
constexpr auto STEPS = 5;
constexpr auto alpha = 3.14;

static_assert(PROBLEM_SIZE % BLOCK_SIZE == 0);

void daxpy(std::ofstream& plotFile)
{
const auto* title = "baseline std::vector";
std::cout << title << "\n";

Stopwatch watch;
auto x = std::vector<double>(PROBLEM_SIZE);
auto y = std::vector<double>(PROBLEM_SIZE);
auto z = std::vector<double>(PROBLEM_SIZE);
watch.printAndReset("alloc");

for(std::size_t i = 0; i < PROBLEM_SIZE; ++i)
{
x[i] = static_cast<double>(i);
y[i] = static_cast<double>(i);
}
watch.printAndReset("init");

double sum = 0;
for(std::size_t s = 0; s < WARMUP_STEPS + STEPS; ++s)
{
#pragma omp parallel for
for(std::ptrdiff_t i = 0; i < PROBLEM_SIZE; i++)
z[i] = alpha * x[i] + y[i];
if(s < WARMUP_STEPS)
watch.printAndReset("daxpy (warmup)");
else
sum += watch.printAndReset("daxpy");
}
plotFile << std::quoted(title) << "\t" << sum / STEPS << '\n';
}

template<typename Acc>
inline constexpr bool isGPU = false;

#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
template<typename Dim, typename Idx>
inline constexpr bool isGPU<alpaka::AccGpuCudaRt<Dim, Idx>> = true;
#endif

template<typename Mapping>
void daxpy_alpaka_llama(std::string mappingName, std::ofstream& plotFile, Mapping mapping)
{
std::size_t storageSize = 0;
for(std::size_t i = 0; i < mapping.blobCount; i++)
storageSize += mapping.blobSize(i);

auto title = "alpaka/LLAMA " + std::move(mappingName);
fmt::print("{0} (blobs size: {1}MiB)\n", title, storageSize / 1024 / 1024);

using Dim = alpaka::DimInt<1>;
using Size = std::size_t;
using Acc = alpaka::ExampleDefaultAcc<Dim, Size>;
using Dev = alpaka::Dev<Acc>;
using Queue = alpaka::Queue<Dev, alpaka::Blocking>;
const auto devAcc = alpaka::getDevByIdx<alpaka::Pltf<Dev>>(0u);
const auto devHost = alpaka::getDevByIdx<alpaka::PltfCpu>(0u);
auto queue = Queue(devAcc);

Stopwatch watch;
auto x = llama::allocViewUninitialized(mapping);
auto y = llama::allocViewUninitialized(mapping);
auto z = llama::allocViewUninitialized(mapping);
watch.printAndReset("alloc host");

for(std::size_t i = 0; i < PROBLEM_SIZE; ++i)
{
x[i] = static_cast<double>(i);
y[i] = static_cast<double>(i);
}
watch.printAndReset("init host");

static_assert(Mapping::blobCount == 1); // make our life simpler
const auto bufferSize = mapping.blobSize(0);
const auto extents = alpaka::Vec<Dim, Size>{bufferSize};
auto bufferX = alpaka::allocBuf<std::byte, Size>(devAcc, extents);
auto bufferY = alpaka::allocBuf<std::byte, Size>(devAcc, extents);
auto bufferZ = alpaka::allocBuf<std::byte, Size>(devAcc, extents);
watch.printAndReset("alloc device");

{
auto vx = alpaka::createView(devHost, &x.storageBlobs[0][0], extents);
auto vy = alpaka::createView(devHost, &y.storageBlobs[0][0], extents);
alpaka::memcpy(queue, bufferX, vx, extents);
alpaka::memcpy(queue, bufferY, vy, extents);
}
watch.printAndReset("copy H->D");

auto viewX = llama::View{mapping, llama::Array{alpaka::getPtrNative(bufferX)}};
auto viewY = llama::View{mapping, llama::Array{alpaka::getPtrNative(bufferY)}};
auto viewZ = llama::View{mapping, llama::Array{alpaka::getPtrNative(bufferZ)}};

constexpr auto blockSize = isGPU<Acc> ? BLOCK_SIZE : 1;
const auto workdiv = alpaka::WorkDivMembers<Dim, Size>(
alpaka::Vec<Dim, Size>{PROBLEM_SIZE / blockSize},
alpaka::Vec<Dim, Size>{blockSize},
alpaka::Vec<Dim, Size>{Size{1}});
watch = {};

double sum = 0;
for(std::size_t s = 0; s < WARMUP_STEPS + STEPS; ++s)
{
auto kernel
= [] ALPAKA_FN_ACC(const Acc& acc, decltype(viewX) x, decltype(viewY) y, double alpha, decltype(viewZ) z)
{
const auto [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
z[i] = alpha * x[i] + y[i];
};
alpaka::exec<Acc>(queue, workdiv, kernel, viewX, viewY, alpha, viewZ);
if(s < WARMUP_STEPS)
watch.printAndReset("daxpy (warmup)");
else
sum += watch.printAndReset("daxpy");
}

{
auto vz = alpaka::createView(devHost, &z.storageBlobs[0][0], extents);
alpaka::memcpy(queue, vz, bufferZ, extents);
}
watch.printAndReset("copy D->H");

plotFile << std::quoted(title) << "\t" << sum / STEPS << '\n';
}

auto main() -> int
try
{
const auto numThreads = static_cast<std::size_t>(omp_get_max_threads());
const char* affinity = std::getenv("GOMP_CPU_AFFINITY"); // NOLINT(concurrency-mt-unsafe)
affinity = affinity == nullptr ? "NONE - PLEASE PIN YOUR THREADS!" : affinity;

fmt::print(
R"({}Mi doubles ({}MiB data)
Threads: {}
Affinity: {}
)",
PROBLEM_SIZE / 1024 / 1024,
PROBLEM_SIZE * sizeof(double) / 1024 / 1024,
numThreads,
affinity);

std::ofstream plotFile{"daxpy.sh"};
plotFile.exceptions(std::ios::badbit | std::ios::failbit);
plotFile << fmt::format(
R"(#!/usr/bin/gnuplot -p
# threads: {} affinity: {}
set title "daxpy CPU {}Mi doubles on {}"
set style data histograms
set style fill solid
set xtics rotate by 45 right
set key off
set yrange [0:*]
set ylabel "runtime [s]"
$data << EOD
)",
numThreads,
affinity,
PROBLEM_SIZE / 1024 / 1024,
common::hostname());

daxpy(plotFile);

const auto extents = llama::ArrayExtents{PROBLEM_SIZE};
daxpy_alpaka_llama("AoS", plotFile, llama::mapping::AoS{extents, double{}});
daxpy_alpaka_llama("SoA", plotFile, llama::mapping::SoA<llama::ArrayExtentsDynamic<1>, double, false>{extents});
daxpy_alpaka_llama(
"Bytesplit",
plotFile,
llama::mapping::Bytesplit<llama::ArrayExtentsDynamic<1>, double, llama::mapping::BindAoS<>::fn>{extents});
daxpy_alpaka_llama(
"ChangeType D->F",
plotFile,
llama::mapping::ChangeType<
llama::ArrayExtentsDynamic<1>,
double,
llama::mapping::BindAoS<>::fn,
boost::mp11::mp_list<boost::mp11::mp_list<double, float>>>{extents});
daxpy_alpaka_llama("Bitpack 52^{11}", plotFile, llama::mapping::BitPackedFloatSoA{extents, 11, 52, double{}});
daxpy_alpaka_llama(
"Bitpack 52^{11} CT",
plotFile,
llama::mapping::
BitPackedFloatSoA<llama::ArrayExtentsDynamic<1>, double, llama::Constant<11>, llama::Constant<52>>{
extents});
daxpy_alpaka_llama("Bitpack 23^{8}", plotFile, llama::mapping::BitPackedFloatSoA{extents, 8, 23, double{}});
daxpy_alpaka_llama(
"Bitpack 23^{8} CT",
plotFile,
llama::mapping::
BitPackedFloatSoA<llama::ArrayExtentsDynamic<1>, double, llama::Constant<8>, llama::Constant<23>>{
extents});
daxpy_alpaka_llama("Bitpack 10^{5}", plotFile, llama::mapping::BitPackedFloatSoA{extents, 5, 10, double{}});
daxpy_alpaka_llama(
"Bitpack 10^{5} CT",
plotFile,
llama::mapping::
BitPackedFloatSoA<llama::ArrayExtentsDynamic<1>, double, llama::Constant<5>, llama::Constant<10>>{
extents});

plotFile << R"(EOD
plot $data using 2:xtic(1)
)";
std::cout << "Plot with: ./daxpy.sh\n";

return 0;
}
catch(const std::exception& e)
{
std::cerr << "Exception: " << e.what() << '\n';
}
Loading

0 comments on commit 30cfe97

Please sign in to comment.