Skip to content

Commit

Permalink
Convert indices in asyncblur to int
Browse files Browse the repository at this point in the history
  • Loading branch information
bernhardmgruber committed Jun 13, 2022
1 parent f7112fe commit c2697f0
Showing 1 changed file with 56 additions and 58 deletions.
114 changes: 56 additions & 58 deletions examples/alpaka/asyncblur/asyncblur.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ using PixelOnAcc = llama::Record<
/** Alpaka kernel functor used to blur a small image living in the device memory
* using the \ref PixelOnAcc record dimension
*/
template<std::size_t Elems, std::size_t KernelSize, std::size_t ElemsPerBlock>
template<int Elems, int KernelSize, int ElemsPerBlock>
struct BlurKernel
{
template<typename Acc, typename View>
Expand All @@ -85,10 +85,8 @@ struct BlurKernel
{
// Using SoA for the shared memory
constexpr auto sharedChunkSize = ElemsPerBlock + 2 * KernelSize;
constexpr auto sharedMapping = llama::mapping::SoA<
llama::ArrayExtents<std::size_t, sharedChunkSize, sharedChunkSize>,
typename View::RecordDim,
false>{};
constexpr auto sharedMapping = llama::mapping::
SoA<llama::ArrayExtents<int, sharedChunkSize, sharedChunkSize>, typename View::RecordDim, false>{};
auto& sharedMem = alpaka::declareSharedVar<std::byte[sharedMapping.blobSize(0)], __COUNTER__>(acc);
return llama::View(sharedMapping, llama::Array<std::byte*, 1>{&sharedMem[0]});
}
Expand All @@ -102,9 +100,9 @@ struct BlurKernel
constexpr auto threadsPerBlock = ElemsPerBlock / Elems;
const auto threadIdxInBlock = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);

const std::size_t bStart[2]
const int bStart[2]
= {bi[0] * ElemsPerBlock + threadIdxInBlock[0], bi[1] * ElemsPerBlock + threadIdxInBlock[1]};
const std::size_t bEnd[2] = {
const int bEnd[2] = {
alpaka::math::min(acc, bStart[0] + ElemsPerBlock + 2 * KernelSize, oldImage.mapping().extents()[0]),
alpaka::math::min(acc, bStart[1] + ElemsPerBlock + 2 * KernelSize, oldImage.mapping().extents()[1]),
};
Expand All @@ -117,8 +115,8 @@ struct BlurKernel
alpaka::syncBlockThreads(acc);
}

const std::size_t start[2] = {ti[0] * Elems, ti[1] * Elems};
const std::size_t end[2] = {
const int start[2] = {ti[0] * Elems, ti[1] * Elems};
const int end[2] = {
alpaka::math::min(acc, start[0] + Elems, oldImage.mapping().extents()[0] - 2 * KernelSize),
alpaka::math::min(acc, start[1] + Elems, oldImage.mapping().extents()[1] - 2 * KernelSize),
};
Expand All @@ -143,9 +141,9 @@ struct BlurKernel
for(auto a = iAStart; a < i_a_end; ++a)
{
if constexpr(SHARED)
sum += sharedView(std::size_t(b), std::size_t(a));
sum += sharedView(b, a);
else
sum += oldImage(std::size_t(b), std::size_t(a));
sum += oldImage(b, a);
}
sum /= FP((2 * KernelSize + 1) * (2 * KernelSize + 1));
newImage(y + KernelSize, x + KernelSize) = sum;
Expand All @@ -159,7 +157,7 @@ try
// ALPAKA
using Dim = alpaka::DimInt<2>;

using Acc = alpaka::ExampleDefaultAcc<Dim, std::size_t>;
using Acc = alpaka::ExampleDefaultAcc<Dim, int>;
// using Acc = alpaka::AccGpuCudaRt<Dim, Size>;
// using Acc = alpaka::AccCpuSerial<Dim, Size>;

Expand All @@ -171,19 +169,19 @@ try
const DevAcc devAcc = alpaka::getDevByIdx<PltfAcc>(0);
const DevHost devHost = alpaka::getDevByIdx<PltfHost>(0);
std::vector<Queue> queue;
for(std::size_t i = 0; i < CHUNK_COUNT; ++i)
for(int i = 0; i < CHUNK_COUNT; ++i)
queue.emplace_back(devAcc);

// ASYNCCOPY
std::size_t img_x = DEFAULT_IMG_X;
std::size_t img_y = DEFAULT_IMG_Y;
std::size_t buffer_x = DEFAULT_IMG_X + 2 * KERNEL_SIZE;
std::size_t buffer_y = DEFAULT_IMG_Y + 2 * KERNEL_SIZE;
int img_x = DEFAULT_IMG_X;
int img_y = DEFAULT_IMG_Y;
int buffer_x = DEFAULT_IMG_X + 2 * KERNEL_SIZE;
int buffer_y = DEFAULT_IMG_Y + 2 * KERNEL_SIZE;

constexpr std::size_t hardwareThreads = 2; // relevant for OpenMP2Threads
constexpr int hardwareThreads = 2; // relevant for OpenMP2Threads
using Distribution = common::ThreadsElemsDistribution<Acc, ELEMS_PER_BLOCK, hardwareThreads>;
constexpr std::size_t elemCount = Distribution::elemCount;
constexpr std::size_t threadCount = Distribution::threadCount;
constexpr int elemCount = Distribution::elemCount;
constexpr int threadCount = Distribution::threadCount;

std::vector<unsigned char> image;
std::string out_filename = "output.png";
Expand All @@ -207,13 +205,13 @@ try
}

// LLAMA
using ArrayIndex = llama::ArrayIndex<std::size_t, 2>;
using ArrayIndex = llama::ArrayIndex<int, 2>;

auto treeOperationList = llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT()};
const auto hostMapping
= llama::mapping::tree::Mapping{llama::ArrayExtents{buffer_y, buffer_x}, treeOperationList, Pixel{}};
const auto devMapping = llama::mapping::tree::Mapping{
llama::ArrayExtents<std::size_t, CHUNK_SIZE + 2 * KERNEL_SIZE, CHUNK_SIZE + 2 * KERNEL_SIZE>{},
llama::ArrayExtents<int, CHUNK_SIZE + 2 * KERNEL_SIZE, CHUNK_SIZE + 2 * KERNEL_SIZE>{},
treeOperationList,
PixelOnAcc{}};
using DevMapping = std::decay_t<decltype(devMapping)>;
Expand All @@ -226,26 +224,26 @@ try

Stopwatch chrono;

auto hostBuffer = alpaka::allocBuf<std::byte, std::size_t>(devHost, hostBufferSize);
auto hostBuffer = alpaka::allocBuf<std::byte, int>(devHost, hostBufferSize);
auto hostView = viewAlpakaBuffer(hostMapping, hostBuffer);

std::vector<alpaka::Buf<DevHost, std::byte, alpaka::DimInt<1>, std::size_t>> hostChunkBuffer;
std::vector<alpaka::Buf<DevHost, std::byte, alpaka::DimInt<1>, int>> hostChunkBuffer;
std::vector<llama::View<DevMapping, std::byte*>> hostChunkView;

std::vector<alpaka::Buf<DevAcc, std::byte, alpaka::DimInt<1>, std::size_t>> devOldBuffer;
std::vector<alpaka::Buf<DevAcc, std::byte, alpaka::DimInt<1>, std::size_t>> devNewBuffer;
std::vector<alpaka::Buf<DevAcc, std::byte, alpaka::DimInt<1>, int>> devOldBuffer;
std::vector<alpaka::Buf<DevAcc, std::byte, alpaka::DimInt<1>, int>> devNewBuffer;
std::vector<llama::View<DevMapping, std::byte*>> devOldView;
std::vector<llama::View<DevMapping, std::byte*>> devNewView;

for(std::size_t i = 0; i < CHUNK_COUNT; ++i)
for(int i = 0; i < CHUNK_COUNT; ++i)
{
hostChunkBuffer.push_back(alpaka::allocBuf<std::byte, std::size_t>(devHost, devBufferSize));
hostChunkBuffer.push_back(alpaka::allocBuf<std::byte, int>(devHost, devBufferSize));
hostChunkView.push_back(viewAlpakaBuffer(devMapping, hostChunkBuffer.back()));

devOldBuffer.push_back(alpaka::allocBuf<std::byte, std::size_t>(devAcc, devBufferSize));
devOldBuffer.push_back(alpaka::allocBuf<std::byte, int>(devAcc, devBufferSize));
devOldView.push_back(viewAlpakaBuffer(devMapping, devOldBuffer.back()));

devNewBuffer.push_back(alpaka::allocBuf<std::byte, std::size_t>(devAcc, devBufferSize));
devNewBuffer.push_back(alpaka::allocBuf<std::byte, int>(devAcc, devBufferSize));
devNewView.push_back(viewAlpakaBuffer(devMapping, devNewBuffer.back()));
}

Expand All @@ -256,10 +254,10 @@ try
image.resize(img_x * img_y * 3);
std::default_random_engine generator;
std::normal_distribution<FP> distribution{FP(0), FP(0.5)};
for(std::size_t y = 0; y < buffer_y; ++y)
for(int y = 0; y < buffer_y; ++y)
{
LLAMA_INDEPENDENT_DATA
for(std::size_t x = 0; x < buffer_x; ++x)
for(int x = 0; x < buffer_x; ++x)
{
hostView(y, x)(tag::R()) = std::abs(distribution(generator));
hostView(y, x)(tag::G()) = std::abs(distribution(generator));
Expand All @@ -269,13 +267,13 @@ try
}
else
{
for(std::size_t y = 0; y < buffer_y; ++y)
for(int y = 0; y < buffer_y; ++y)
{
LLAMA_INDEPENDENT_DATA
for(std::size_t x = 0; x < buffer_x; ++x)
for(int x = 0; x < buffer_x; ++x)
{
const auto X = std::clamp<std::size_t>(x, KERNEL_SIZE, img_x + KERNEL_SIZE - 1);
const auto Y = std::clamp<std::size_t>(y, KERNEL_SIZE, img_y + KERNEL_SIZE - 1);
const auto X = std::clamp<int>(x, KERNEL_SIZE, img_x + KERNEL_SIZE - 1);
const auto Y = std::clamp<int>(y, KERNEL_SIZE, img_y + KERNEL_SIZE - 1);
const auto* pixel = &image[((Y - KERNEL_SIZE) * img_x + X - KERNEL_SIZE) * 3];
hostView(y, x)(tag::R()) = FP(pixel[0]) / 255;
hostView(y, x)(tag::G()) = FP(pixel[1]) / 255;
Expand All @@ -285,25 +283,25 @@ try
}

chrono.printAndReset("Init");
const auto elems = alpaka::Vec<Dim, size_t>(elemCount, elemCount);
const auto threads = alpaka::Vec<Dim, size_t>(threadCount, threadCount);
const auto blocks = alpaka::Vec<Dim, size_t>(
static_cast<size_t>((CHUNK_SIZE + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK),
static_cast<size_t>((CHUNK_SIZE + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK));
const alpaka::Vec<Dim, size_t> chunks(
static_cast<size_t>((img_y + CHUNK_SIZE - 1) / CHUNK_SIZE),
static_cast<size_t>((img_x + CHUNK_SIZE - 1) / CHUNK_SIZE));
const auto elems = alpaka::Vec<Dim, int>(elemCount, elemCount);
const auto threads = alpaka::Vec<Dim, int>(threadCount, threadCount);
const auto blocks = alpaka::Vec<Dim, int>(
static_cast<int>((CHUNK_SIZE + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK),
static_cast<int>((CHUNK_SIZE + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK));
const alpaka::Vec<Dim, int> chunks(
static_cast<int>((img_y + CHUNK_SIZE - 1) / CHUNK_SIZE),
static_cast<int>((img_x + CHUNK_SIZE - 1) / CHUNK_SIZE));

const auto workdiv = alpaka::WorkDivMembers<Dim, size_t>{blocks, threads, elems};
const auto workdiv = alpaka::WorkDivMembers<Dim, int>{blocks, threads, elems};

struct VirtualHostElement
{
llama::VirtualView<decltype(hostView)&> virtualHost;
const llama::ArrayExtentsDynamic<std::size_t, 2> validMiniSize;
const llama::ArrayExtentsDynamic<int, 2> validMiniSize;
};
std::list<VirtualHostElement> virtualHostList;
for(std::size_t chunk_y = 0; chunk_y < chunks[0]; ++chunk_y)
for(std::size_t chunk_x = 0; chunk_x < chunks[1]; ++chunk_x)
for(int chunk_y = 0; chunk_y < chunks[0]; ++chunk_y)
for(int chunk_x = 0; chunk_x < chunks[1]; ++chunk_x)
{
// Create virtual view with size of mini view
const auto validMiniSize = llama::ArrayExtents{
Expand All @@ -312,7 +310,7 @@ try
llama::VirtualView virtualHost(hostView, {chunk_y * CHUNK_SIZE, chunk_x * CHUNK_SIZE});

// Find free chunk stream
std::size_t chunkNr = virtualHostList.size();
int chunkNr = virtualHostList.size();
if(virtualHostList.size() < CHUNK_COUNT)
virtualHostList.push_back({virtualHost, validMiniSize});
else
Expand All @@ -327,10 +325,10 @@ try
{
// Copy data back
LLAMA_INDEPENDENT_DATA
for(std::size_t y = 0; y < chunkIt->validMiniSize[0] - 2 * KERNEL_SIZE; ++y)
for(int y = 0; y < chunkIt->validMiniSize[0] - 2 * KERNEL_SIZE; ++y)
{
LLAMA_INDEPENDENT_DATA
for(std::size_t x = 0; x < chunkIt->validMiniSize[1] - 2 * KERNEL_SIZE; ++x)
for(int x = 0; x < chunkIt->validMiniSize[1] - 2 * KERNEL_SIZE; ++x)
chunkIt->virtualHost(y + KERNEL_SIZE, x + KERNEL_SIZE)
= hostChunkView[chunkNr](y + KERNEL_SIZE, x + KERNEL_SIZE);
}
Expand All @@ -347,10 +345,10 @@ try
}

// Copy data from virtual view to mini view
for(std::size_t y = 0; y < validMiniSize[0]; ++y)
for(int y = 0; y < validMiniSize[0]; ++y)
{
LLAMA_INDEPENDENT_DATA
for(std::size_t x = 0; x < validMiniSize[1]; ++x)
for(int x = 0; x < validMiniSize[1]; ++x)
hostChunkView[chunkNr](y, x) = virtualHost(y, x);
}
alpaka::memcpy(queue[chunkNr], devOldBuffer[chunkNr], hostChunkBuffer[chunkNr], devBufferSize);
Expand All @@ -367,14 +365,14 @@ try

// Wait for unfinished tasks on the accelerator
auto chunkIt = virtualHostList.begin();
for(std::size_t chunkNr = 0; chunkNr < CHUNK_COUNT; ++chunkNr)
for(int chunkNr = 0; chunkNr < CHUNK_COUNT; ++chunkNr)
{
alpaka::wait(queue[chunkNr]);
// Copy data back
for(std::size_t y = 0; y < chunkIt->validMiniSize[0] - 2 * KERNEL_SIZE; ++y)
for(int y = 0; y < chunkIt->validMiniSize[0] - 2 * KERNEL_SIZE; ++y)
{
LLAMA_INDEPENDENT_DATA
for(std::size_t x = 0; x < chunkIt->validMiniSize[1] - 2 * KERNEL_SIZE; ++x)
for(int x = 0; x < chunkIt->validMiniSize[1] - 2 * KERNEL_SIZE; ++x)
chunkIt->virtualHost(y + KERNEL_SIZE, x + KERNEL_SIZE)
= hostChunkView[chunkNr](y + KERNEL_SIZE, x + KERNEL_SIZE);
}
Expand All @@ -384,10 +382,10 @@ try

if(SAVE)
{
for(std::size_t y = 0; y < img_y; ++y)
for(int y = 0; y < img_y; ++y)
{
LLAMA_INDEPENDENT_DATA
for(std::size_t x = 0; x < img_x; ++x)
for(int x = 0; x < img_x; ++x)
{
auto* pixel = &image[(y * img_x + x) * 3];
pixel[0] = static_cast<unsigned char>(hostView(y + KERNEL_SIZE, x + KERNEL_SIZE)(tag::R()) * 255.);
Expand Down

0 comments on commit c2697f0

Please sign in to comment.