Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix building alpaka examples #527

Merged
merged 4 commits into from
Jun 13, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DBUILD_TESTING=ON -DLLAMA_BUILD_EXAMPLES=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_BUILD_TYPE=$CONFIG -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=ON -DALPAKA_ACC_CPU_DISABLE_ATOMIC_REF=ON -DALPAKA_CXX_STANDARD=17 -DCMAKE_TOOLCHAIN_FILE=/usr/local/share/vcpkg/scripts/buildsystems/vcpkg.cmake
cmake .. -DBUILD_TESTING=ON -DLLAMA_BUILD_EXAMPLES=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_BUILD_TYPE=$CONFIG -Dalpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE=ON -Dalpaka_ACC_CPU_DISABLE_ATOMIC_REF=ON -Dalpaka_CXX_STANDARD=17 -DCMAKE_TOOLCHAIN_FILE=/usr/local/share/vcpkg/scripts/buildsystems/vcpkg.cmake
sed -i 's/\(-forward-unknown-to-host-compiler\|--generate-code=arch=[^ ]\+\|--expt-extended-lambda\|--expt-relaxed-constexpr\|--use_fast_math\)//g' compile_commands.json # remove NVCC specific flags which clang cannot handle
run-clang-tidy-14 -header-filter='^((?!/thirdparty/).)*$' -extra-arg=--no-cuda-version-check -extra-arg=-nocudalib -extra-arg=-Wno-unused-command-line-argument '^(?!.*'$PWD').*$'

Expand Down Expand Up @@ -243,7 +243,7 @@ jobs:
unset CUDACXX
fi
echo "nvcc is here: $CUDACXX"
cmake .. -DBUILD_TESTING=ON -DLLAMA_BUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=$CONFIG -DLLAMA_ENABLE_ASAN_FOR_TESTS=ON -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=${{ !matrix.cuda }} -DALPAKA_ACC_CPU_DISABLE_ATOMIC_REF=ON -DALPAKA_ACC_GPU_CUDA_ENABLE=${{ matrix.cuda }} -DALPAKA_CXX_STANDARD=17 -DCMAKE_CUDA_COMPILER=$CUDACXX -DCMAKE_TOOLCHAIN_FILE=$VCPKG_INSTALLATION_ROOT/scripts/buildsystems/vcpkg.cmake
cmake .. -DBUILD_TESTING=ON -DLLAMA_BUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=$CONFIG -DLLAMA_ENABLE_ASAN_FOR_TESTS=ON -Dalpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE=${{ !matrix.cuda }} -Dalpaka_ACC_CPU_DISABLE_ATOMIC_REF=ON -Dalpaka_ACC_GPU_CUDA_ENABLE=${{ matrix.cuda }} -Dalpaka_CXX_STANDARD=17 -DCMAKE_CUDA_COMPILER=$CUDACXX -DCMAKE_TOOLCHAIN_FILE=$VCPKG_INSTALLATION_ROOT/scripts/buildsystems/vcpkg.cmake
- name: build tests + examples
run: |
if [ ${{ matrix.install_oneapi }} ]; then source /opt/intel/oneapi/setvars.sh; fi
Expand Down Expand Up @@ -283,7 +283,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DBUILD_TESTING=ON -DLLAMA_BUILD_EXAMPLES=ON -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=ON "-DCMAKE_TOOLCHAIN_FILE=$env:VCPKG_INSTALLATION_ROOT/scripts/buildsystems/vcpkg.cmake"
cmake .. -DBUILD_TESTING=ON -DLLAMA_BUILD_EXAMPLES=ON -Dalpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE=ON "-DCMAKE_TOOLCHAIN_FILE=$env:VCPKG_INSTALLATION_ROOT/scripts/buildsystems/vcpkg.cmake"
- name: build tests + examples
run: cmake --build build -j $env:THREADS --config $env:CONFIG
- name: run tests
Expand Down Expand Up @@ -320,7 +320,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DBUILD_TESTING=ON -DLLAMA_BUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=$CONFIG -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=ON -DALPAKA_CXX_STANDARD=17 -DCMAKE_TOOLCHAIN_FILE=$VCPKG_INSTALLATION_ROOT/scripts/buildsystems/vcpkg.cmake
cmake .. -DBUILD_TESTING=ON -DLLAMA_BUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=$CONFIG -Dalpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE=ON -Dalpaka_CXX_STANDARD=17 -DCMAKE_TOOLCHAIN_FILE=$VCPKG_INSTALLATION_ROOT/scripts/buildsystems/vcpkg.cmake
- name: build tests + examples
run: |
cmake --build build -j $THREADS
Expand Down
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -99,13 +99,13 @@ if (LLAMA_BUILD_EXAMPLES)

# alpaka examples
find_package(alpaka 0.9.0 QUIET)
if (_ALPAKA_FOUND)
if (_alpaka_FOUND)
add_subdirectory("examples/alpaka/nbody")
add_subdirectory("examples/alpaka/vectoradd")
add_subdirectory("examples/alpaka/asyncblur")
add_subdirectory("examples/alpaka/pic")
add_subdirectory("examples/alpaka/daxpy")
elseif()
else()
message(WARNING "Could not find alpaka. Alpaka examples are disabled.")
endif()

Expand Down
114 changes: 56 additions & 58 deletions examples/alpaka/asyncblur/asyncblur.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ using PixelOnAcc = llama::Record<
/** Alpaka kernel functor used to blur a small image living in the device memory
* using the \ref PixelOnAcc record dimension
*/
template<std::size_t Elems, std::size_t KernelSize, std::size_t ElemsPerBlock>
template<int Elems, int KernelSize, int ElemsPerBlock>
struct BlurKernel
{
template<typename Acc, typename View>
Expand All @@ -85,10 +85,8 @@ struct BlurKernel
{
// Using SoA for the shared memory
constexpr auto sharedChunkSize = ElemsPerBlock + 2 * KernelSize;
constexpr auto sharedMapping = llama::mapping::SoA<
llama::ArrayExtents<std::size_t, sharedChunkSize, sharedChunkSize>,
typename View::RecordDim,
false>{};
constexpr auto sharedMapping = llama::mapping::
SoA<llama::ArrayExtents<int, sharedChunkSize, sharedChunkSize>, typename View::RecordDim, false>{};
auto& sharedMem = alpaka::declareSharedVar<std::byte[sharedMapping.blobSize(0)], __COUNTER__>(acc);
return llama::View(sharedMapping, llama::Array<std::byte*, 1>{&sharedMem[0]});
}
Expand All @@ -102,9 +100,9 @@ struct BlurKernel
constexpr auto threadsPerBlock = ElemsPerBlock / Elems;
const auto threadIdxInBlock = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);

const std::size_t bStart[2]
const int bStart[2]
= {bi[0] * ElemsPerBlock + threadIdxInBlock[0], bi[1] * ElemsPerBlock + threadIdxInBlock[1]};
const std::size_t bEnd[2] = {
const int bEnd[2] = {
alpaka::math::min(acc, bStart[0] + ElemsPerBlock + 2 * KernelSize, oldImage.mapping().extents()[0]),
alpaka::math::min(acc, bStart[1] + ElemsPerBlock + 2 * KernelSize, oldImage.mapping().extents()[1]),
};
Expand All @@ -117,8 +115,8 @@ struct BlurKernel
alpaka::syncBlockThreads(acc);
}

const std::size_t start[2] = {ti[0] * Elems, ti[1] * Elems};
const std::size_t end[2] = {
const int start[2] = {ti[0] * Elems, ti[1] * Elems};
const int end[2] = {
alpaka::math::min(acc, start[0] + Elems, oldImage.mapping().extents()[0] - 2 * KernelSize),
alpaka::math::min(acc, start[1] + Elems, oldImage.mapping().extents()[1] - 2 * KernelSize),
};
Expand All @@ -143,9 +141,9 @@ struct BlurKernel
for(auto a = iAStart; a < i_a_end; ++a)
{
if constexpr(SHARED)
sum += sharedView(std::size_t(b), std::size_t(a));
sum += sharedView(b, a);
else
sum += oldImage(std::size_t(b), std::size_t(a));
sum += oldImage(b, a);
}
sum /= FP((2 * KernelSize + 1) * (2 * KernelSize + 1));
newImage(y + KernelSize, x + KernelSize) = sum;
Expand All @@ -159,7 +157,7 @@ try
// ALPAKA
using Dim = alpaka::DimInt<2>;

using Acc = alpaka::ExampleDefaultAcc<Dim, std::size_t>;
using Acc = alpaka::ExampleDefaultAcc<Dim, int>;
// using Acc = alpaka::AccGpuCudaRt<Dim, Size>;
// using Acc = alpaka::AccCpuSerial<Dim, Size>;

Expand All @@ -171,19 +169,19 @@ try
const DevAcc devAcc = alpaka::getDevByIdx<PltfAcc>(0);
const DevHost devHost = alpaka::getDevByIdx<PltfHost>(0);
std::vector<Queue> queue;
for(std::size_t i = 0; i < CHUNK_COUNT; ++i)
for(int i = 0; i < CHUNK_COUNT; ++i)
queue.emplace_back(devAcc);

// ASYNCCOPY
std::size_t img_x = DEFAULT_IMG_X;
std::size_t img_y = DEFAULT_IMG_Y;
std::size_t buffer_x = DEFAULT_IMG_X + 2 * KERNEL_SIZE;
std::size_t buffer_y = DEFAULT_IMG_Y + 2 * KERNEL_SIZE;
int img_x = DEFAULT_IMG_X;
int img_y = DEFAULT_IMG_Y;
int buffer_x = DEFAULT_IMG_X + 2 * KERNEL_SIZE;
int buffer_y = DEFAULT_IMG_Y + 2 * KERNEL_SIZE;

constexpr std::size_t hardwareThreads = 2; // relevant for OpenMP2Threads
constexpr int hardwareThreads = 2; // relevant for OpenMP2Threads
using Distribution = common::ThreadsElemsDistribution<Acc, ELEMS_PER_BLOCK, hardwareThreads>;
constexpr std::size_t elemCount = Distribution::elemCount;
constexpr std::size_t threadCount = Distribution::threadCount;
constexpr int elemCount = Distribution::elemCount;
constexpr int threadCount = Distribution::threadCount;

std::vector<unsigned char> image;
std::string out_filename = "output.png";
Expand All @@ -207,13 +205,13 @@ try
}

// LLAMA
using ArrayIndex = llama::ArrayIndex<2>;
using ArrayIndex = llama::ArrayIndex<int, 2>;

auto treeOperationList = llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT()};
const auto hostMapping
= llama::mapping::tree::Mapping{llama::ArrayExtents{buffer_y, buffer_x}, treeOperationList, Pixel{}};
const auto devMapping = llama::mapping::tree::Mapping{
llama::ArrayExtents<std::size_t, CHUNK_SIZE + 2 * KERNEL_SIZE, CHUNK_SIZE + 2 * KERNEL_SIZE>{},
llama::ArrayExtents<int, CHUNK_SIZE + 2 * KERNEL_SIZE, CHUNK_SIZE + 2 * KERNEL_SIZE>{},
treeOperationList,
PixelOnAcc{}};
using DevMapping = std::decay_t<decltype(devMapping)>;
Expand All @@ -226,26 +224,26 @@ try

Stopwatch chrono;

auto hostBuffer = alpaka::allocBuf<std::byte, std::size_t>(devHost, hostBufferSize);
auto hostBuffer = alpaka::allocBuf<std::byte, int>(devHost, hostBufferSize);
auto hostView = viewAlpakaBuffer(hostMapping, hostBuffer);

std::vector<alpaka::Buf<DevHost, std::byte, alpaka::DimInt<1>, std::size_t>> hostChunkBuffer;
std::vector<alpaka::Buf<DevHost, std::byte, alpaka::DimInt<1>, int>> hostChunkBuffer;
std::vector<llama::View<DevMapping, std::byte*>> hostChunkView;

std::vector<alpaka::Buf<DevAcc, std::byte, alpaka::DimInt<1>, std::size_t>> devOldBuffer;
std::vector<alpaka::Buf<DevAcc, std::byte, alpaka::DimInt<1>, std::size_t>> devNewBuffer;
std::vector<alpaka::Buf<DevAcc, std::byte, alpaka::DimInt<1>, int>> devOldBuffer;
std::vector<alpaka::Buf<DevAcc, std::byte, alpaka::DimInt<1>, int>> devNewBuffer;
std::vector<llama::View<DevMapping, std::byte*>> devOldView;
std::vector<llama::View<DevMapping, std::byte*>> devNewView;

for(std::size_t i = 0; i < CHUNK_COUNT; ++i)
for(int i = 0; i < CHUNK_COUNT; ++i)
{
hostChunkBuffer.push_back(alpaka::allocBuf<std::byte, std::size_t>(devHost, devBufferSize));
hostChunkBuffer.push_back(alpaka::allocBuf<std::byte, int>(devHost, devBufferSize));
hostChunkView.push_back(viewAlpakaBuffer(devMapping, hostChunkBuffer.back()));

devOldBuffer.push_back(alpaka::allocBuf<std::byte, std::size_t>(devAcc, devBufferSize));
devOldBuffer.push_back(alpaka::allocBuf<std::byte, int>(devAcc, devBufferSize));
devOldView.push_back(viewAlpakaBuffer(devMapping, devOldBuffer.back()));

devNewBuffer.push_back(alpaka::allocBuf<std::byte, std::size_t>(devAcc, devBufferSize));
devNewBuffer.push_back(alpaka::allocBuf<std::byte, int>(devAcc, devBufferSize));
devNewView.push_back(viewAlpakaBuffer(devMapping, devNewBuffer.back()));
}

Expand All @@ -256,10 +254,10 @@ try
image.resize(img_x * img_y * 3);
std::default_random_engine generator;
std::normal_distribution<FP> distribution{FP(0), FP(0.5)};
for(std::size_t y = 0; y < buffer_y; ++y)
for(int y = 0; y < buffer_y; ++y)
{
LLAMA_INDEPENDENT_DATA
for(std::size_t x = 0; x < buffer_x; ++x)
for(int x = 0; x < buffer_x; ++x)
{
hostView(y, x)(tag::R()) = std::abs(distribution(generator));
hostView(y, x)(tag::G()) = std::abs(distribution(generator));
Expand All @@ -269,13 +267,13 @@ try
}
else
{
for(std::size_t y = 0; y < buffer_y; ++y)
for(int y = 0; y < buffer_y; ++y)
{
LLAMA_INDEPENDENT_DATA
for(std::size_t x = 0; x < buffer_x; ++x)
for(int x = 0; x < buffer_x; ++x)
{
const auto X = std::clamp<std::size_t>(x, KERNEL_SIZE, img_x + KERNEL_SIZE - 1);
const auto Y = std::clamp<std::size_t>(y, KERNEL_SIZE, img_y + KERNEL_SIZE - 1);
const auto X = std::clamp<int>(x, KERNEL_SIZE, img_x + KERNEL_SIZE - 1);
const auto Y = std::clamp<int>(y, KERNEL_SIZE, img_y + KERNEL_SIZE - 1);
const auto* pixel = &image[((Y - KERNEL_SIZE) * img_x + X - KERNEL_SIZE) * 3];
hostView(y, x)(tag::R()) = FP(pixel[0]) / 255;
hostView(y, x)(tag::G()) = FP(pixel[1]) / 255;
Expand All @@ -285,25 +283,25 @@ try
}

chrono.printAndReset("Init");
const auto elems = alpaka::Vec<Dim, size_t>(elemCount, elemCount);
const auto threads = alpaka::Vec<Dim, size_t>(threadCount, threadCount);
const auto blocks = alpaka::Vec<Dim, size_t>(
static_cast<size_t>((CHUNK_SIZE + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK),
static_cast<size_t>((CHUNK_SIZE + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK));
const alpaka::Vec<Dim, size_t> chunks(
static_cast<size_t>((img_y + CHUNK_SIZE - 1) / CHUNK_SIZE),
static_cast<size_t>((img_x + CHUNK_SIZE - 1) / CHUNK_SIZE));
const auto elems = alpaka::Vec<Dim, int>(elemCount, elemCount);
const auto threads = alpaka::Vec<Dim, int>(threadCount, threadCount);
const auto blocks = alpaka::Vec<Dim, int>(
static_cast<int>((CHUNK_SIZE + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK),
static_cast<int>((CHUNK_SIZE + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK));
const alpaka::Vec<Dim, int> chunks(
static_cast<int>((img_y + CHUNK_SIZE - 1) / CHUNK_SIZE),
static_cast<int>((img_x + CHUNK_SIZE - 1) / CHUNK_SIZE));

const auto workdiv = alpaka::WorkDivMembers<Dim, size_t>{blocks, threads, elems};
const auto workdiv = alpaka::WorkDivMembers<Dim, int>{blocks, threads, elems};

struct VirtualHostElement
{
llama::VirtualView<decltype(hostView)&> virtualHost;
const llama::ArrayExtentsDynamic<2> validMiniSize;
const llama::ArrayExtentsDynamic<int, 2> validMiniSize;
};
std::list<VirtualHostElement> virtualHostList;
for(std::size_t chunk_y = 0; chunk_y < chunks[0]; ++chunk_y)
for(std::size_t chunk_x = 0; chunk_x < chunks[1]; ++chunk_x)
for(int chunk_y = 0; chunk_y < chunks[0]; ++chunk_y)
for(int chunk_x = 0; chunk_x < chunks[1]; ++chunk_x)
{
// Create virtual view with size of mini view
const auto validMiniSize = llama::ArrayExtents{
Expand All @@ -312,7 +310,7 @@ try
llama::VirtualView virtualHost(hostView, {chunk_y * CHUNK_SIZE, chunk_x * CHUNK_SIZE});

// Find free chunk stream
std::size_t chunkNr = virtualHostList.size();
int chunkNr = virtualHostList.size();
if(virtualHostList.size() < CHUNK_COUNT)
virtualHostList.push_back({virtualHost, validMiniSize});
else
Expand All @@ -327,10 +325,10 @@ try
{
// Copy data back
LLAMA_INDEPENDENT_DATA
for(std::size_t y = 0; y < chunkIt->validMiniSize[0] - 2 * KERNEL_SIZE; ++y)
for(int y = 0; y < chunkIt->validMiniSize[0] - 2 * KERNEL_SIZE; ++y)
{
LLAMA_INDEPENDENT_DATA
for(std::size_t x = 0; x < chunkIt->validMiniSize[1] - 2 * KERNEL_SIZE; ++x)
for(int x = 0; x < chunkIt->validMiniSize[1] - 2 * KERNEL_SIZE; ++x)
chunkIt->virtualHost(y + KERNEL_SIZE, x + KERNEL_SIZE)
= hostChunkView[chunkNr](y + KERNEL_SIZE, x + KERNEL_SIZE);
}
Expand All @@ -347,10 +345,10 @@ try
}

// Copy data from virtual view to mini view
for(std::size_t y = 0; y < validMiniSize[0]; ++y)
for(int y = 0; y < validMiniSize[0]; ++y)
{
LLAMA_INDEPENDENT_DATA
for(std::size_t x = 0; x < validMiniSize[1]; ++x)
for(int x = 0; x < validMiniSize[1]; ++x)
hostChunkView[chunkNr](y, x) = virtualHost(y, x);
}
alpaka::memcpy(queue[chunkNr], devOldBuffer[chunkNr], hostChunkBuffer[chunkNr], devBufferSize);
Expand All @@ -367,14 +365,14 @@ try

// Wait for unfinished tasks on the accelerator
auto chunkIt = virtualHostList.begin();
for(std::size_t chunkNr = 0; chunkNr < CHUNK_COUNT; ++chunkNr)
for(int chunkNr = 0; chunkNr < CHUNK_COUNT; ++chunkNr)
{
alpaka::wait(queue[chunkNr]);
// Copy data back
for(std::size_t y = 0; y < chunkIt->validMiniSize[0] - 2 * KERNEL_SIZE; ++y)
for(int y = 0; y < chunkIt->validMiniSize[0] - 2 * KERNEL_SIZE; ++y)
{
LLAMA_INDEPENDENT_DATA
for(std::size_t x = 0; x < chunkIt->validMiniSize[1] - 2 * KERNEL_SIZE; ++x)
for(int x = 0; x < chunkIt->validMiniSize[1] - 2 * KERNEL_SIZE; ++x)
chunkIt->virtualHost(y + KERNEL_SIZE, x + KERNEL_SIZE)
= hostChunkView[chunkNr](y + KERNEL_SIZE, x + KERNEL_SIZE);
}
Expand All @@ -384,10 +382,10 @@ try

if(SAVE)
{
for(std::size_t y = 0; y < img_y; ++y)
for(int y = 0; y < img_y; ++y)
{
LLAMA_INDEPENDENT_DATA
for(std::size_t x = 0; x < img_x; ++x)
for(int x = 0; x < img_x; ++x)
{
auto* pixel = &image[(y * img_x + x) * 3];
pixel[0] = static_cast<unsigned char>(hostView(y + KERNEL_SIZE, x + KERNEL_SIZE)(tag::R()) * 255.);
Expand Down
Loading