diff --git a/examples/alpaka/asyncblur/asyncblur.cpp b/examples/alpaka/asyncblur/asyncblur.cpp
index d0376a90a8..52b08ac154 100644
--- a/examples/alpaka/asyncblur/asyncblur.cpp
+++ b/examples/alpaka/asyncblur/asyncblur.cpp
@@ -44,7 +44,7 @@ auto viewAlpakaBuffer(
     Mapping& mapping,
     AlpakaBuffer& buffer) // taking mapping by & on purpose, so Mapping can deduce const
 {
-    return llama::View{mapping, {alpaka::mem::view::getPtrNative(buffer)}};
+    return llama::View{mapping, {alpaka::getPtrNative(buffer)}};
 }

 // clang-format off
@@ -77,7 +77,7 @@ struct BlurKernel
     template <typename Acc, typename View>
     LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View oldImage, View newImage) const
     {
-        const auto ti = alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc);
+        const auto ti = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);

         [[maybe_unused]] auto sharedView = [&] {
             if constexpr (SHARED)
@@ -89,18 +89,18 @@ struct BlurKernel
                     llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT()},
                     typename View::DatumDomain{});
                 constexpr auto sharedMemSize = llama::sizeOf * sharedChunkSize * sharedChunkSize;
-                auto& sharedMem = alpaka::block::shared::st::allocVar(acc);
+                auto& sharedMem = alpaka::allocVar(acc);
                 return llama::View{sharedMapping, llama::Array{&sharedMem[0]}};
             }
             else
                 return int{}; // dummy
         }();

-        [[maybe_unused]] const auto bi = alpaka::idx::getIdx<alpaka::Grid, alpaka::Blocks>(acc);
+        [[maybe_unused]] const auto bi = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc);
         if constexpr (SHARED)
         {
             constexpr auto threadsPerBlock = ElemsPerBlock / Elems;
-            const auto threadIdxInBlock = alpaka::idx::getIdx<alpaka::Block, alpaka::Threads>(acc);
+            const auto threadIdxInBlock = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc);

             const std::size_t bStart[2]
                 = {bi[0] * ElemsPerBlock + threadIdxInBlock[0], bi[1] * ElemsPerBlock + threadIdxInBlock[1]};
@@ -114,7 +114,7 @@ struct BlurKernel
                 for (auto x = bStart[1]; x < bEnd[1]; x += threadsPerBlock)
                     sharedView(y - bi[0] * ElemsPerBlock, x - bi[1] * ElemsPerBlock) = oldImage(y, x);
-            alpaka::block::sync::syncBlockThreads(acc);
+            alpaka::syncBlockThreads(acc);
         }

         const std::size_t start[2] = {ti[0] * Elems, ti[1] * Elems};
@@ -157,20 +157,20 @@ struct BlurKernel
 int main(int argc, char** argv)
 {
     // ALPAKA
-    using Dim = alpaka::dim::DimInt<2>;
+    using Dim = alpaka::DimInt<2>;

-    using Acc = alpaka::example::ExampleDefaultAcc;
-    // using Acc = alpaka::acc::AccGpuCudaRt;
-    // using Acc = alpaka::acc::AccCpuSerial;
+    using Acc = alpaka::ExampleDefaultAcc;
+    // using Acc = alpaka::AccGpuCudaRt;
+    // using Acc = alpaka::AccCpuSerial;
     using Queue
-        = alpaka::queue::Queue>;
-    using DevHost = alpaka::dev::DevCpu;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfHost = alpaka::pltf::Pltf<DevHost>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    const DevAcc devAcc = alpaka::pltf::getDevByIdx<PltfAcc>(0);
-    const DevHost devHost = alpaka::pltf::getDevByIdx<PltfHost>(0);
+        = alpaka::Queue>;
+    using DevHost = alpaka::DevCpu;
+    using DevAcc = alpaka::Dev<Acc>;
+    using PltfHost = alpaka::Pltf<DevHost>;
+    using PltfAcc = alpaka::Pltf<DevAcc>;
+    const DevAcc devAcc = alpaka::getDevByIdx<PltfAcc>(0);
+    const DevHost devHost = alpaka::getDevByIdx<PltfHost>(0);

     std::vector<Queue> queue;
     for (std::size_t i = 0; i < CHUNK_COUNT; ++i)
         queue.push_back(Queue(devAcc));
@@ -225,25 +225,25 @@ int main(int argc, char** argv)

     Chrono chrono;

-    auto hostBuffer = alpaka::mem::buf::alloc(devHost, hostBufferSize);
+    auto hostBuffer = alpaka::allocBuf(devHost, hostBufferSize);
     auto hostView = viewAlpakaBuffer(hostMapping, hostBuffer);

-    std::vector, std::size_t>> hostChunkBuffer;
+    std::vector, std::size_t>> hostChunkBuffer;
     std::vector> hostChunkView;

-    std::vector, std::size_t>> devOldBuffer,
+    std::vector, std::size_t>> devOldBuffer,
         devNewBuffer;
     std::vector> devOldView, devNewView;

     for (std::size_t i = 0; i < CHUNK_COUNT; ++i)
     {
-        hostChunkBuffer.push_back(alpaka::mem::buf::alloc(devHost, devBufferSize));
+        hostChunkBuffer.push_back(alpaka::allocBuf(devHost, devBufferSize));
         hostChunkView.push_back(viewAlpakaBuffer(devMapping, hostChunkBuffer.back()));
-        devOldBuffer.push_back(alpaka::mem::buf::alloc(devAcc, devBufferSize));
+        devOldBuffer.push_back(alpaka::allocBuf(devAcc, devBufferSize));
         devOldView.push_back(viewAlpakaBuffer(devMapping, devOldBuffer.back()));
-        devNewBuffer.push_back(alpaka::mem::buf::alloc(devAcc, devBufferSize));
+        devNewBuffer.push_back(alpaka::allocBuf(devAcc, devBufferSize));
         devNewView.push_back(viewAlpakaBuffer(devMapping, devNewBuffer.back()));
     }
@@ -283,16 +283,16 @@ int main(int argc, char** argv)
     }
     chrono.printAndReset("Init");

-    const auto elems = alpaka::vec::Vec(elemCount, elemCount);
-    const auto threads = alpaka::vec::Vec(threadCount, threadCount);
-    const auto blocks = alpaka::vec::Vec(
+    const auto elems = alpaka::Vec(elemCount, elemCount);
+    const auto threads = alpaka::Vec(threadCount, threadCount);
+    const auto blocks = alpaka::Vec(
         static_cast((CHUNK_SIZE + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK),
         static_cast((CHUNK_SIZE + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK));
-    const alpaka::vec::Vec chunks(
+    const alpaka::Vec chunks(
         static_cast((img_y + CHUNK_SIZE - 1) / CHUNK_SIZE),
         static_cast((img_x + CHUNK_SIZE - 1) / CHUNK_SIZE));

-    const auto workdiv = alpaka::workdiv::WorkDivMembers{blocks, threads, elems};
+    const auto workdiv = alpaka::WorkDivMembers{blocks, threads, elems};

     struct VirtualHostElement
     {
@@ -321,7 +321,7 @@ int main(int argc, char** argv)
             auto chunkIt = virtualHostList.begin();
             for (chunkNr = 0; chunkNr < CHUNK_COUNT; ++chunkNr)
             {
-                if (alpaka::queue::empty(queue[chunkNr]))
+                if (alpaka::empty(queue[chunkNr]))
                 {
                     // Copy data back
                     LLAMA_INDEPENDENT_DATA
@@ -351,23 +351,23 @@ int main(int argc, char** argv)
                 for (std::size_t x = 0; x < validMiniSize[1]; ++x)
                     hostChunkView[chunkNr](y, x) = virtualHost(y, x);
         }
-        alpaka::mem::view::copy(queue[chunkNr], devOldBuffer[chunkNr], hostChunkBuffer[chunkNr], devBufferSize);
+        alpaka::memcpy(queue[chunkNr], devOldBuffer[chunkNr], hostChunkBuffer[chunkNr], devBufferSize);

-        alpaka::kernel::exec<Acc>(
+        alpaka::exec<Acc>(
             queue[chunkNr],
             workdiv,
             BlurKernel{},
             devOldView[chunkNr],
             devNewView[chunkNr]);
-        alpaka::mem::view::copy(queue[chunkNr], hostChunkBuffer[chunkNr], devNewBuffer[chunkNr], devBufferSize);
+        alpaka::memcpy(queue[chunkNr], hostChunkBuffer[chunkNr], devNewBuffer[chunkNr], devBufferSize);
     }

     // Wait for not finished tasks on accelerator
     auto chunkIt = virtualHostList.begin();
     for (std::size_t chunkNr = 0; chunkNr < CHUNK_COUNT; ++chunkNr)
    {
-        alpaka::wait::wait(queue[chunkNr]);
+        alpaka::wait(queue[chunkNr]);

         // Copy data back
         for (std::size_t y = 0; y < chunkIt->validMiniSize[0] - 2 * KERNEL_SIZE; ++y)
         {
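The asyncblur changes are a pure rename pass for alpaka 0.6, which drops the nested namespaces (alpaka::mem::view::getPtrNative becomes alpaka::getPtrNative, alpaka::queue::empty becomes alpaka::empty, and so on). The viewAlpakaBuffer helper shows the pattern all three examples rely on: allocate an alpaka buffer per LLAMA blob and hand its native pointer to a llama::View. A minimal sketch of that pattern against the flattened API, assuming a single blob and std::byte as the element type; the helper name and signature are illustrative and not taken from the patch:

    #include <alpaka/alpaka.hpp>
    #include <llama/llama.hpp>

    #include <cstddef>
    #include <utility>

    // Allocate one alpaka buffer for blob 0 of a LLAMA mapping and wrap it in a view.
    // The view only stores the raw pointer and does not own the memory, so the buffer
    // handle is returned as well to keep the allocation alive.
    template <typename Mapping, typename Dev, typename Size>
    auto allocViewOn(Mapping& mapping, const Dev& dev, Size blobSize)
    {
        auto buffer = alpaka::allocBuf<std::byte, Size>(dev, blobSize);
        auto view = llama::View{mapping, llama::Array{alpaka::getPtrNative(buffer)}};
        return std::make_pair(std::move(buffer), view);
    }

Usage mirrors the examples: call it once per device (or once per chunk in asyncblur) and pass the resulting views to the kernels.
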
diff --git a/examples/alpaka/nbody/nbody.cpp b/examples/alpaka/nbody/nbody.cpp
index 49d8ca5157..4bc2e65fcc 100644
--- a/examples/alpaka/nbody/nbody.cpp
+++ b/examples/alpaka/nbody/nbody.cpp
@@ -95,7 +95,7 @@ struct UpdateKernel
             else
             {
                 constexpr auto sharedMemSize = llama::sizeOf * BlockSize;
-                auto& sharedMem = alpaka::block::shared::st::allocVar(acc);
+                auto& sharedMem = alpaka::allocVar(acc);
                 return llama::View{sharedMapping, llama::Array{&sharedMem[0]}};
             }
         }
@@ -103,8 +103,8 @@
             return int{}; // dummy
         }();

-        const auto ti = alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
-        const auto tbi = alpaka::idx::getIdx<alpaka::Block, alpaka::Threads>(acc)[0];
+        const auto ti = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
+        const auto tbi = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0];

         const auto start = ti * Elems;
         const auto end = alpaka::math::min(acc, start + Elems, ProblemSize);
@@ -118,7 +118,7 @@ struct UpdateKernel
                 LLAMA_INDEPENDENT_DATA
                 for (auto pos2 = decltype(end2)(0); pos2 + ti < end2; pos2 += BlockSize / Elems)
                     sharedView(pos2 + tbi) = particles(start2 + pos2 + tbi);
-                alpaka::block::sync::syncBlockThreads(acc);
+                alpaka::syncBlockThreads(acc);
             }
             LLAMA_INDEPENDENT_DATA
             for (auto pos2 = decltype(end2)(0); pos2 < end2; ++pos2)
@@ -129,7 +129,7 @@ struct UpdateKernel
                 else
                     pPInteraction(particles(i), particles(start2 + pos2), ts);
             if constexpr (USE_SHARED)
-                alpaka::block::sync::syncBlockThreads(acc);
+                alpaka::syncBlockThreads(acc);
         }
     }
 };
@@ -142,7 +142,7 @@ struct MoveKernel
     template <typename Acc, typename View>
     LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View particles, FP ts) const
     {
-        const auto ti = alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
+        const auto ti = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];

         const auto start = ti * Elems;
         const auto end = alpaka::math::min(acc, start + Elems, ProblemSize);
@@ -155,20 +155,20 @@
 int main(int argc, char** argv)
 {
-    using Dim = alpaka::dim::DimInt<1>;
+    using Dim = alpaka::DimInt<1>;
     using Size = std::size_t;

-    using Acc = alpaka::example::ExampleDefaultAcc;
-    // using Acc = alpaka::acc::AccGpuCudaRt;
-    // using Acc = alpaka::acc::AccCpuSerial;
-
-    using DevHost = alpaka::dev::DevCpu;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfHost = alpaka::pltf::Pltf<DevHost>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    using Queue = alpaka::queue::Queue;
-    const DevAcc devAcc(alpaka::pltf::getDevByIdx<PltfAcc>(0u));
-    const DevHost devHost(alpaka::pltf::getDevByIdx<PltfHost>(0u));
+    using Acc = alpaka::ExampleDefaultAcc;
+    // using Acc = alpaka::AccGpuCudaRt;
+    // using Acc = alpaka::AccCpuSerial;
+
+    using DevHost = alpaka::DevCpu;
+    using DevAcc = alpaka::Dev<Acc>;
+    using PltfHost = alpaka::Pltf<DevHost>;
+    using PltfAcc = alpaka::Pltf<DevAcc>;
+    using Queue = alpaka::Queue;
+    const DevAcc devAcc(alpaka::getDevByIdx<PltfAcc>(0u));
+    const DevHost devHost(alpaka::getDevByIdx<PltfHost>(0u));
     Queue queue(devAcc);

     // NBODY
@@ -204,13 +204,13 @@ int main(int argc, char** argv)

     const auto bufferSize = Size(mapping.getBlobSize(0));

-    auto hostBuffer = alpaka::mem::buf::alloc(devHost, bufferSize);
-    auto accBuffer = alpaka::mem::buf::alloc(devAcc, bufferSize);
+    auto hostBuffer = alpaka::allocBuf(devHost, bufferSize);
+    auto accBuffer = alpaka::allocBuf(devAcc, bufferSize);

     chrono.printAndReset("Alloc");

-    auto hostView = llama::View{mapping, llama::Array{alpaka::mem::view::getPtrNative(hostBuffer)}};
-    auto accView = llama::View{mapping, llama::Array{alpaka::mem::view::getPtrNative(accBuffer)}};
+    auto hostView = llama::View{mapping, llama::Array{alpaka::getPtrNative(hostBuffer)}};
+    auto accView = llama::View{mapping, llama::Array{alpaka::getPtrNative(accBuffer)}};

     chrono.printAndReset("Views");
@@ -233,29 +233,29 @@ int main(int argc, char** argv)

     chrono.printAndReset("Init");

-    alpaka::mem::view::copy(queue, accBuffer, hostBuffer, bufferSize);
+    alpaka::memcpy(queue, accBuffer, hostBuffer, bufferSize);
     chrono.printAndReset("Copy H->D");

-    const alpaka::vec::Vec Elems(static_cast(elemCount));
-    const alpaka::vec::Vec threads(static_cast(threadCount));
+    const alpaka::Vec Elems(static_cast(elemCount));
+    const alpaka::Vec threads(static_cast(threadCount));
     constexpr auto innerCount = elemCount * threadCount;
-    const alpaka::vec::Vec blocks(static_cast((PROBLEM_SIZE + innerCount - 1u) / innerCount));
+    const alpaka::Vec blocks(static_cast((PROBLEM_SIZE + innerCount - 1u) / innerCount));

-    const auto workdiv = alpaka::workdiv::WorkDivMembers{blocks, threads, Elems};
+    const auto workdiv = alpaka::WorkDivMembers{blocks, threads, Elems};

     for (std::size_t s = 0; s < STEPS; ++s)
     {
         UpdateKernel updateKernel;
-        alpaka::kernel::exec<Acc>(queue, workdiv, updateKernel, accView, ts);
+        alpaka::exec<Acc>(queue, workdiv, updateKernel, accView, ts);
         chrono.printAndReset("Update kernel");

         MoveKernel moveKernel;
-        alpaka::kernel::exec<Acc>(queue, workdiv, moveKernel, accView, ts);
+        alpaka::exec<Acc>(queue, workdiv, moveKernel, accView, ts);
         chrono.printAndReset("Move kernel");
     }

-    alpaka::mem::view::copy(queue, hostBuffer, accBuffer, bufferSize);
+    alpaka::memcpy(queue, hostBuffer, accBuffer, bufferSize);
     chrono.printAndReset("Copy D->H");

     return 0;
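On the kernel side the nbody changes are again one-to-one renames: alpaka::idx::getIdx, alpaka::block::sync::syncBlockThreads and alpaka::kernel::exec lose their nested namespaces in alpaka 0.6. A minimal sketch of a kernel functor written directly against the flattened names (the kernel itself and its arguments are illustrative and not part of this patch):

    #include <alpaka/alpaka.hpp>

    #include <cstddef>

    struct ScaleKernel
    {
        template <typename Acc, typename T>
        ALPAKA_FN_ACC void operator()(const Acc& acc, T* data, T factor, std::size_t n) const
        {
            // was alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc) before 0.6
            const auto i = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
            if (i < n)
                data[i] *= factor;
            // was alpaka::block::sync::syncBlockThreads(acc); not needed by this kernel,
            // shown only for the renamed barrier
            alpaka::syncBlockThreads(acc);
        }
    };

    // launched like the kernels above, e.g.:
    //     alpaka::exec<Acc>(queue, workdiv, ScaleKernel{}, ptr, T(2), n);
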
diff --git a/examples/alpaka/vectoradd/vectoradd.cpp b/examples/alpaka/vectoradd/vectoradd.cpp
index 1495c36192..760ce4e03c 100644
--- a/examples/alpaka/vectoradd/vectoradd.cpp
+++ b/examples/alpaka/vectoradd/vectoradd.cpp
@@ -44,7 +44,7 @@ struct AddKernel
     template <typename Acc, typename View>
     LLAMA_FN_HOST_ACC_INLINE void operator()(const Acc& acc, View a, View b) const
     {
-        const auto ti = alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
+        const auto ti = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];

         const auto start = ti * Elems;
         const auto end = alpaka::math::min(acc, start + Elems, ProblemSize);
@@ -62,20 +62,20 @@ struct AddKernel
 int main(int argc, char** argv)
 {
     // ALPAKA
-    using Dim = alpaka::dim::DimInt<1>;
+    using Dim = alpaka::DimInt<1>;
     using Size = std::size_t;

-    using Acc = alpaka::example::ExampleDefaultAcc;
-    // using Acc = alpaka::acc::AccGpuCudaRt;
-    // using Acc = alpaka::acc::AccCpuSerial;
-
-    using DevHost = alpaka::dev::DevCpu;
-    using DevAcc = alpaka::dev::Dev<Acc>;
-    using PltfHost = alpaka::pltf::Pltf<DevHost>;
-    using PltfAcc = alpaka::pltf::Pltf<DevAcc>;
-    using Queue = alpaka::queue::Queue;
-    const DevAcc devAcc(alpaka::pltf::getDevByIdx<PltfAcc>(0u));
-    const DevHost devHost(alpaka::pltf::getDevByIdx<PltfHost>(0u));
+    using Acc = alpaka::ExampleDefaultAcc;
+    // using Acc = alpaka::AccGpuCudaRt;
+    // using Acc = alpaka::AccCpuSerial;
+
+    using DevHost = alpaka::DevCpu;
+    using DevAcc = alpaka::Dev<Acc>;
+    using PltfHost = alpaka::Pltf<DevHost>;
+    using PltfAcc = alpaka::Pltf<DevAcc>;
+    using Queue = alpaka::Queue;
+    const DevAcc devAcc(alpaka::getDevByIdx<PltfAcc>(0u));
+    const DevHost devHost(alpaka::getDevByIdx<PltfHost>(0u));
     Queue queue(devAcc);

     // LLAMA
@@ -105,18 +105,18 @@ int main(int argc, char** argv)

     const auto bufferSize = Size(mapping.getBlobSize(0));

     // allocate buffers
-    auto hostBufferA = alpaka::mem::buf::alloc(devHost, bufferSize);
-    auto hostBufferB = alpaka::mem::buf::alloc(devHost, bufferSize);
-    auto devBufferA = alpaka::mem::buf::alloc(devAcc, bufferSize);
-    auto devBufferB = alpaka::mem::buf::alloc(devAcc, bufferSize);
+    auto hostBufferA = alpaka::allocBuf(devHost, bufferSize);
+    auto hostBufferB = alpaka::allocBuf(devHost, bufferSize);
+    auto devBufferA = alpaka::allocBuf(devAcc, bufferSize);
+    auto devBufferB = alpaka::allocBuf(devAcc, bufferSize);

     chrono.printAndReset("Alloc");

     // create LLAMA views
-    auto hostA = llama::View{mapping, llama::Array{alpaka::mem::view::getPtrNative(hostBufferA)}};
-    auto hostB = llama::View{mapping, llama::Array{alpaka::mem::view::getPtrNative(hostBufferB)}};
-    auto devA = llama::View{mapping, llama::Array{alpaka::mem::view::getPtrNative(devBufferA)}};
-    auto devB = llama::View{mapping, llama::Array{alpaka::mem::view::getPtrNative(devBufferB)}};
+    auto hostA = llama::View{mapping, llama::Array{alpaka::getPtrNative(hostBufferA)}};
+    auto hostB = llama::View{mapping, llama::Array{alpaka::getPtrNative(hostBufferB)}};
+    auto devA = llama::View{mapping, llama::Array{alpaka::getPtrNative(devBufferA)}};
+    auto devB = llama::View{mapping, llama::Array{alpaka::getPtrNative(devBufferB)}};

     chrono.printAndReset("Views");
@@ -131,8 +131,8 @@ int main(int argc, char** argv)
     }
     chrono.printAndReset("Init");

-    alpaka::mem::view::copy(queue, devBufferA, hostBufferA, bufferSize);
-    alpaka::mem::view::copy(queue, devBufferB, hostBufferB, bufferSize);
+    alpaka::memcpy(queue, devBufferA, hostBufferA, bufferSize);
+    alpaka::memcpy(queue, devBufferB, hostBufferB, bufferSize);

     chrono.printAndReset("Copy H->D");

@@ -140,21 +140,21 @@ int main(int argc, char** argv)
     using Distribution = common::ThreadsElemsDistribution;
     constexpr std::size_t elemCount = Distribution::elemCount;
     constexpr std::size_t threadCount = Distribution::threadCount;
-    const alpaka::vec::Vec elems(static_cast(elemCount));
-    const alpaka::vec::Vec threads(static_cast(threadCount));
+    const alpaka::Vec elems(static_cast(elemCount));
+    const alpaka::Vec threads(static_cast(threadCount));
     constexpr auto innerCount = elemCount * threadCount;
-    const alpaka::vec::Vec blocks(static_cast((PROBLEM_SIZE + innerCount - 1) / innerCount));
+    const alpaka::Vec blocks(static_cast((PROBLEM_SIZE + innerCount - 1) / innerCount));

-    const auto workdiv = alpaka::workdiv::WorkDivMembers{blocks, threads, elems};
+    const auto workdiv = alpaka::WorkDivMembers{blocks, threads, elems};

     for (std::size_t s = 0; s < STEPS; ++s)
     {
-        alpaka::kernel::exec<Acc>(queue, workdiv, AddKernel{}, devA, devB);
+        alpaka::exec<Acc>(queue, workdiv, AddKernel{}, devA, devB);
         chrono.printAndReset("Add kernel");
     }

-    alpaka::mem::view::copy(queue, hostBufferA, devBufferA, bufferSize);
-    alpaka::mem::view::copy(queue, hostBufferB, devBufferB, bufferSize);
+    alpaka::memcpy(queue, hostBufferA, devBufferA, bufferSize);
+    alpaka::memcpy(queue, hostBufferB, devBufferB, bufferSize);

     chrono.printAndReset("Copy D->H");

diff --git a/examples/common/alpakaHelpers.hpp b/examples/common/alpakaHelpers.hpp
index 215beea131..5d84360477 100644
--- a/examples/common/alpakaHelpers.hpp
+++ b/examples/common/alpakaHelpers.hpp
@@ -29,7 +29,7 @@ namespace common
 #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
     template
-    struct ThreadsElemsDistribution, blockSize, hardwareThreads>
+    struct ThreadsElemsDistribution, blockSize, hardwareThreads>
     {
         static constexpr std::size_t elemCount = THREADELEMDIST_MIN_ELEM;
         static constexpr std::size_t threadCount = blockSize / THREADELEMDIST_MIN_ELEM;
@@ -38,7 +38,7 @@ namespace common
 #ifdef ALPAKA_ACC_CPU_B_SEQ_T_OMP2_ENABLED
     template
-    struct ThreadsElemsDistribution, blockSize, hardwareThreads>
+    struct ThreadsElemsDistribution, blockSize, hardwareThreads>
     {
         static constexpr std::size_t elemCount = (blockSize + hardwareThreads - 1u) / hardwareThreads;
         static constexpr std::size_t threadCount = hardwareThreads;
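
The alpakaHelpers.hpp hunks only swap the accelerator tags (alpaka::acc::AccGpuCudaRt, alpaka::acc::AccCpuOmp2Threads) for their flattened spellings; the ThreadsElemsDistribution interface itself, a compile-time split of a block into threads and per-thread elements, stays the same. A sketch of how the examples turn such a distribution into an alpaka work division, assuming alpaka 0.6 and the helper header's include path; the problem constants are illustrative:

    #include "alpakaHelpers.hpp" // assumed path for common::ThreadsElemsDistribution

    #include <alpaka/alpaka.hpp>
    #include <alpaka/example/ExampleDefaultAcc.hpp>

    #include <cstddef>

    constexpr std::size_t problemSize = 64 * 1024; // illustrative
    constexpr std::size_t blockSize = 256; // illustrative
    constexpr std::size_t hardwareThreads = 2; // only used by the OpenMP2-threads specialization

    int main()
    {
        using Dim = alpaka::DimInt<1>;
        using Size = std::size_t;
        using Acc = alpaka::ExampleDefaultAcc<Dim, Size>;

        // compile-time choice of threads and per-thread elements for the selected accelerator
        using Distribution = common::ThreadsElemsDistribution<Acc, blockSize, hardwareThreads>;
        constexpr std::size_t elemCount = Distribution::elemCount;
        constexpr std::size_t threadCount = Distribution::threadCount;

        const alpaka::Vec<Dim, Size> elems(static_cast<Size>(elemCount));
        const alpaka::Vec<Dim, Size> threads(static_cast<Size>(threadCount));
        constexpr auto innerCount = elemCount * threadCount;
        const alpaka::Vec<Dim, Size> blocks(static_cast<Size>((problemSize + innerCount - 1) / innerCount));

        // passed to alpaka::exec<Acc>(queue, workdiv, kernel, ...) exactly as in the examples
        const auto workdiv = alpaka::WorkDivMembers<Dim, Size>{blocks, threads, elems};
        (void) workdiv;
        return 0;
    }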