-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
c441afc
commit ff18b95
Showing
4 changed files
with
235 additions
and
168 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
8 changes: 4 additions & 4 deletions
8
examples/daxpy/CMakeLists.txt → examples/alpaka/daxpy/CMakeLists.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,230 @@ | ||
#include "../../common/Stopwatch.hpp"
#include "../../common/hostname.hpp"

#include <alpaka/alpaka.hpp>
#include <alpaka/example/ExampleDefaultAcc.hpp>
#include <cstdlib>
#include <fmt/core.h>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <llama/llama.hpp>
#include <omp.h>
#include <vector>
|
||
// Benchmark configuration: 128Mi doubles => 1 GiB per buffer (x, y, z).
constexpr auto PROBLEM_SIZE = std::size_t{1024 * 1024 * 128};
constexpr auto BLOCK_SIZE = std::size_t{256}; // threads per block on GPU accelerators
constexpr auto WARMUP_STEPS = 1; // iterations run but excluded from the reported mean
constexpr auto STEPS = 5; // measured iterations averaged into the plot file
constexpr auto alpha = 3.14; // the daxpy scalar: z = alpha * x + y

// The GPU work division computes PROBLEM_SIZE / BLOCK_SIZE blocks, so the
// division must be exact or elements at the end would be skipped.
static_assert(PROBLEM_SIZE % BLOCK_SIZE == 0);
|
||
void daxpy(std::ofstream& plotFile) | ||
{ | ||
const auto* title = "baseline std::vector"; | ||
std::cout << title << "\n"; | ||
|
||
Stopwatch watch; | ||
auto x = std::vector<double>(PROBLEM_SIZE); | ||
auto y = std::vector<double>(PROBLEM_SIZE); | ||
auto z = std::vector<double>(PROBLEM_SIZE); | ||
watch.printAndReset("alloc"); | ||
|
||
for(std::size_t i = 0; i < PROBLEM_SIZE; ++i) | ||
{ | ||
x[i] = static_cast<double>(i); | ||
y[i] = static_cast<double>(i); | ||
} | ||
watch.printAndReset("init"); | ||
|
||
double sum = 0; | ||
for(std::size_t s = 0; s < WARMUP_STEPS + STEPS; ++s) | ||
{ | ||
#pragma omp parallel for | ||
for(std::ptrdiff_t i = 0; i < PROBLEM_SIZE; i++) | ||
z[i] = alpha * x[i] + y[i]; | ||
if(s < WARMUP_STEPS) | ||
watch.printAndReset("daxpy (warmup)"); | ||
else | ||
sum += watch.printAndReset("daxpy"); | ||
} | ||
plotFile << std::quoted(title) << "\t" << sum / STEPS << '\n'; | ||
} | ||
|
||
// Trait: true iff the accelerator runs on a GPU. Used below to choose the
// work division (BLOCK_SIZE threads per block on GPU, 1 on CPU back-ends).
template<typename Acc>
inline constexpr bool isGPU = false;

#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
// Specialization for the CUDA runtime accelerator.
template<typename Dim, typename Idx>
inline constexpr bool isGPU<alpaka::AccGpuCudaRt<Dim, Idx>> = true;
#endif
|
||
/// Runs daxpy (z = alpha * x + y) as an alpaka kernel over LLAMA views using
/// the given memory mapping. Host views are initialized, blobs are copied to
/// the device, the kernel runs WARMUP_STEPS + STEPS times, and the mean
/// runtime of the measured steps is appended as a gnuplot data row.
/// @param mappingName  Human-readable mapping label used in output and plot.
/// @param plotFile     Open gnuplot data stream; one row is appended.
/// @param mapping      LLAMA mapping describing the memory layout; must have
///                     exactly one blob (see static_assert below).
template<typename Mapping>
void daxpy_alpaka_llama(std::string mappingName, std::ofstream& plotFile, Mapping mapping)
{
    // Total storage over all blobs — for the informational print only.
    std::size_t storageSize = 0;
    for(std::size_t i = 0; i < mapping.blobCount; i++)
        storageSize += mapping.blobSize(i);

    auto title = "alpaka/LLAMA " + std::move(mappingName);
    fmt::print("{0} (blobs size: {1}MiB)\n", title, storageSize / 1024 / 1024);

    using Dim = alpaka::DimInt<1>;
    using Size = std::size_t;
    using Acc = alpaka::ExampleDefaultAcc<Dim, Size>;
    using Dev = alpaka::Dev<Acc>;
    using Queue = alpaka::Queue<Dev, alpaka::Blocking>;
    const auto devAcc = alpaka::getDevByIdx<alpaka::Pltf<Dev>>(0u);
    const auto devHost = alpaka::getDevByIdx<alpaka::PltfCpu>(0u);
    // Blocking queue: each exec/memcpy below completes before the next
    // statement, so the Stopwatch readings bracket real work.
    auto queue = Queue(devAcc);

    Stopwatch watch;
    auto x = llama::allocViewUninitialized(mapping);
    auto y = llama::allocViewUninitialized(mapping);
    auto z = llama::allocViewUninitialized(mapping);
    watch.printAndReset("alloc host");

    for(std::size_t i = 0; i < PROBLEM_SIZE; ++i)
    {
        x[i] = static_cast<double>(i);
        y[i] = static_cast<double>(i);
    }
    watch.printAndReset("init host");

    static_assert(Mapping::blobCount == 1); // make our life simpler
    const auto bufferSize = mapping.blobSize(0);
    const auto extents = alpaka::Vec<Dim, Size>{bufferSize};
    auto bufferX = alpaka::allocBuf<std::byte, Size>(devAcc, extents);
    auto bufferY = alpaka::allocBuf<std::byte, Size>(devAcc, extents);
    auto bufferZ = alpaka::allocBuf<std::byte, Size>(devAcc, extents);
    watch.printAndReset("alloc device");

    // Wrap the single host blob of each view so alpaka::memcpy can copy it.
    {
        auto vx = alpaka::createView(devHost, &x.storageBlobs[0][0], extents);
        auto vy = alpaka::createView(devHost, &y.storageBlobs[0][0], extents);
        alpaka::memcpy(queue, bufferX, vx, extents);
        alpaka::memcpy(queue, bufferY, vy, extents);
    }
    watch.printAndReset("copy H->D");

    // Device-side views: same mapping, but backed by the device buffers.
    auto viewX = llama::View{mapping, llama::Array{alpaka::getPtrNative(bufferX)}};
    auto viewY = llama::View{mapping, llama::Array{alpaka::getPtrNative(bufferY)}};
    auto viewZ = llama::View{mapping, llama::Array{alpaka::getPtrNative(bufferZ)}};

    // One thread per element: PROBLEM_SIZE / blockSize blocks of blockSize
    // threads (blockSize is 1 on CPU back-ends, BLOCK_SIZE on GPU).
    constexpr auto blockSize = isGPU<Acc> ? BLOCK_SIZE : 1;
    const auto workdiv = alpaka::WorkDivMembers<Dim, Size>(
        alpaka::Vec<Dim, Size>{PROBLEM_SIZE / blockSize},
        alpaka::Vec<Dim, Size>{blockSize},
        alpaka::Vec<Dim, Size>{Size{1}});
    watch = {}; // restart timing: exclude setup from the kernel measurements

    double sum = 0;
    for(std::size_t s = 0; s < WARMUP_STEPS + STEPS; ++s)
    {
        auto kernel
            = [] ALPAKA_FN_ACC(const Acc& acc, decltype(viewX) x, decltype(viewY) y, double alpha, decltype(viewZ) z)
        {
            const auto [i] = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
            z[i] = alpha * x[i] + y[i];
        };
        alpaka::exec<Acc>(queue, workdiv, kernel, viewX, viewY, alpha, viewZ);
        if(s < WARMUP_STEPS)
            watch.printAndReset("daxpy (warmup)");
        else
            sum += watch.printAndReset("daxpy");
    }

    {
        auto vz = alpaka::createView(devHost, &z.storageBlobs[0][0], extents);
        alpaka::memcpy(queue, vz, bufferZ, extents);
    }
    watch.printAndReset("copy D->H");

    plotFile << std::quoted(title) << "\t" << sum / STEPS << '\n';
}
|
||
auto main() -> int | ||
try | ||
{ | ||
const auto numThreads = static_cast<std::size_t>(omp_get_max_threads()); | ||
const char* affinity = std::getenv("GOMP_CPU_AFFINITY"); // NOLINT(concurrency-mt-unsafe) | ||
affinity = affinity == nullptr ? "NONE - PLEASE PIN YOUR THREADS!" : affinity; | ||
|
||
fmt::print( | ||
R"({}Mi doubles ({}MiB data) | ||
Threads: {} | ||
Affinity: {} | ||
)", | ||
PROBLEM_SIZE / 1024 / 1024, | ||
PROBLEM_SIZE * sizeof(double) / 1024 / 1024, | ||
numThreads, | ||
affinity); | ||
|
||
std::ofstream plotFile{"daxpy.sh"}; | ||
plotFile.exceptions(std::ios::badbit | std::ios::failbit); | ||
plotFile << fmt::format( | ||
R"(#!/usr/bin/gnuplot -p | ||
# threads: {} affinity: {} | ||
set title "daxpy CPU {}Mi doubles on {}" | ||
set style data histograms | ||
set style fill solid | ||
set xtics rotate by 45 right | ||
set key off | ||
set yrange [0:*] | ||
set ylabel "runtime [s]" | ||
$data << EOD | ||
)", | ||
numThreads, | ||
affinity, | ||
PROBLEM_SIZE / 1024 / 1024, | ||
common::hostname()); | ||
|
||
daxpy(plotFile); | ||
|
||
const auto extents = llama::ArrayExtents{PROBLEM_SIZE}; | ||
daxpy_alpaka_llama("AoS", plotFile, llama::mapping::AoS{extents, double{}}); | ||
daxpy_alpaka_llama("SoA", plotFile, llama::mapping::SoA<llama::ArrayExtentsDynamic<1>, double, false>{extents}); | ||
daxpy_alpaka_llama( | ||
"Bytesplit", | ||
plotFile, | ||
llama::mapping::Bytesplit<llama::ArrayExtentsDynamic<1>, double, llama::mapping::BindAoS<>::fn>{extents}); | ||
daxpy_alpaka_llama( | ||
"ChangeType D->F", | ||
plotFile, | ||
llama::mapping::ChangeType< | ||
llama::ArrayExtentsDynamic<1>, | ||
double, | ||
llama::mapping::BindAoS<>::fn, | ||
boost::mp11::mp_list<boost::mp11::mp_list<double, float>>>{extents}); | ||
daxpy_alpaka_llama("Bitpack 52^{11}", plotFile, llama::mapping::BitPackedFloatSoA{extents, 11, 52, double{}}); | ||
daxpy_alpaka_llama( | ||
"Bitpack 52^{11} CT", | ||
plotFile, | ||
llama::mapping:: | ||
BitPackedFloatSoA<llama::ArrayExtentsDynamic<1>, double, llama::Constant<11>, llama::Constant<52>>{ | ||
extents}); | ||
daxpy_alpaka_llama("Bitpack 23^{8}", plotFile, llama::mapping::BitPackedFloatSoA{extents, 8, 23, double{}}); | ||
daxpy_alpaka_llama( | ||
"Bitpack 23^{8} CT", | ||
plotFile, | ||
llama::mapping:: | ||
BitPackedFloatSoA<llama::ArrayExtentsDynamic<1>, double, llama::Constant<8>, llama::Constant<23>>{ | ||
extents}); | ||
daxpy_alpaka_llama("Bitpack 10^{5}", plotFile, llama::mapping::BitPackedFloatSoA{extents, 5, 10, double{}}); | ||
daxpy_alpaka_llama( | ||
"Bitpack 10^{5} CT", | ||
plotFile, | ||
llama::mapping:: | ||
BitPackedFloatSoA<llama::ArrayExtentsDynamic<1>, double, llama::Constant<5>, llama::Constant<10>>{ | ||
extents}); | ||
|
||
plotFile << R"(EOD | ||
plot $data using 2:xtic(1) | ||
)"; | ||
std::cout << "Plot with: ./daxpy.sh\n"; | ||
|
||
return 0; | ||
} | ||
catch(const std::exception& e) | ||
{ | ||
std::cerr << "Exception: " << e.what() << '\n'; | ||
} |
Oops, something went wrong.