From 3fba7bb5f6f97b3298d89192a1e39b81df8d7923 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 10 Jun 2022 18:43:07 +0200 Subject: [PATCH 1/4] Fix finding and configuring alpaka The naming of alpaka's cmake variables changed upstream. --- .github/workflows/ci.yaml | 8 ++++---- CMakeLists.txt | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 6c359f9148..234cc7140f 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -52,7 +52,7 @@ jobs: run: | mkdir build cd build - cmake .. -DBUILD_TESTING=ON -DLLAMA_BUILD_EXAMPLES=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_BUILD_TYPE=$CONFIG -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=ON -DALPAKA_ACC_CPU_DISABLE_ATOMIC_REF=ON -DALPAKA_CXX_STANDARD=17 -DCMAKE_TOOLCHAIN_FILE=/usr/local/share/vcpkg/scripts/buildsystems/vcpkg.cmake + cmake .. -DBUILD_TESTING=ON -DLLAMA_BUILD_EXAMPLES=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_BUILD_TYPE=$CONFIG -Dalpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE=ON -Dalpaka_ACC_CPU_DISABLE_ATOMIC_REF=ON -Dalpaka_CXX_STANDARD=17 -DCMAKE_TOOLCHAIN_FILE=/usr/local/share/vcpkg/scripts/buildsystems/vcpkg.cmake sed -i 's/\(-forward-unknown-to-host-compiler\|--generate-code=arch=[^ ]\+\|--expt-extended-lambda\|--expt-relaxed-constexpr\|--use_fast_math\)//g' compile_commands.json # remove NVCC specific flags which clang cannot handle run-clang-tidy-14 -header-filter='^((?!/thirdparty/).)*$' -extra-arg=--no-cuda-version-check -extra-arg=-nocudalib -extra-arg=-Wno-unused-command-line-argument '^(?!.*'$PWD').*$' @@ -243,7 +243,7 @@ jobs: unset CUDACXX fi echo "nvcc is here: $CUDACXX" - cmake .. -DBUILD_TESTING=ON -DLLAMA_BUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=$CONFIG -DLLAMA_ENABLE_ASAN_FOR_TESTS=ON -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=${{ !matrix.cuda }} -DALPAKA_ACC_CPU_DISABLE_ATOMIC_REF=ON -DALPAKA_ACC_GPU_CUDA_ENABLE=${{ matrix.cuda }} -DALPAKA_CXX_STANDARD=17 -DCMAKE_CUDA_COMPILER=$CUDACXX -DCMAKE_TOOLCHAIN_FILE=$VCPKG_INSTALLATION_ROOT/scripts/buildsystems/vcpkg.cmake + cmake .. -DBUILD_TESTING=ON -DLLAMA_BUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=$CONFIG -DLLAMA_ENABLE_ASAN_FOR_TESTS=ON -Dalpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE=${{ !matrix.cuda }} -Dalpaka_ACC_CPU_DISABLE_ATOMIC_REF=ON -Dalpaka_ACC_GPU_CUDA_ENABLE=${{ matrix.cuda }} -Dalpaka_CXX_STANDARD=17 -DCMAKE_CUDA_COMPILER=$CUDACXX -DCMAKE_TOOLCHAIN_FILE=$VCPKG_INSTALLATION_ROOT/scripts/buildsystems/vcpkg.cmake - name: build tests + examples run: | if [ ${{ matrix.install_oneapi }} ]; then source /opt/intel/oneapi/setvars.sh; fi @@ -283,7 +283,7 @@ jobs: run: | mkdir build cd build - cmake .. -DBUILD_TESTING=ON -DLLAMA_BUILD_EXAMPLES=ON -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=ON "-DCMAKE_TOOLCHAIN_FILE=$env:VCPKG_INSTALLATION_ROOT/scripts/buildsystems/vcpkg.cmake" + cmake .. -DBUILD_TESTING=ON -DLLAMA_BUILD_EXAMPLES=ON -Dalpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE=ON "-DCMAKE_TOOLCHAIN_FILE=$env:VCPKG_INSTALLATION_ROOT/scripts/buildsystems/vcpkg.cmake" - name: build tests + examples run: cmake --build build -j $env:THREADS --config $env:CONFIG - name: run tests @@ -320,7 +320,7 @@ jobs: run: | mkdir build cd build - cmake .. -DBUILD_TESTING=ON -DLLAMA_BUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=$CONFIG -DALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLE=ON -DALPAKA_CXX_STANDARD=17 -DCMAKE_TOOLCHAIN_FILE=$VCPKG_INSTALLATION_ROOT/scripts/buildsystems/vcpkg.cmake + cmake .. -DBUILD_TESTING=ON -DLLAMA_BUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=$CONFIG -Dalpaka_ACC_CPU_B_SEQ_T_SEQ_ENABLE=ON -Dalpaka_CXX_STANDARD=17 -DCMAKE_TOOLCHAIN_FILE=$VCPKG_INSTALLATION_ROOT/scripts/buildsystems/vcpkg.cmake - name: build tests + examples run: | cmake --build build -j $THREADS diff --git a/CMakeLists.txt b/CMakeLists.txt index ce94c978ae..07d72cba48 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,13 +99,13 @@ if (LLAMA_BUILD_EXAMPLES) # alpaka examples find_package(alpaka 0.9.0 QUIET) - if (_ALPAKA_FOUND) + if (_alpaka_FOUND) add_subdirectory("examples/alpaka/nbody") add_subdirectory("examples/alpaka/vectoradd") add_subdirectory("examples/alpaka/asyncblur") add_subdirectory("examples/alpaka/pic") add_subdirectory("examples/alpaka/daxpy") - elseif() + else() message(WARNING "Could not find alpaka. Alpaka examples are disabled.") endif() From f7112fee2af1f924e0c784fe5867c9576d86261f Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 10 Jun 2022 18:52:51 +0200 Subject: [PATCH 2/4] Fix usage of extents in alpaka examples --- examples/alpaka/asyncblur/asyncblur.cpp | 4 +-- examples/alpaka/daxpy/daxpy.cpp | 34 ++++++++++++++++--------- examples/alpaka/nbody/nbody.cpp | 2 +- examples/alpaka/pic/pic.cpp | 4 +-- examples/alpaka/vectoradd/vectoradd.cpp | 2 +- 5 files changed, 28 insertions(+), 18 deletions(-) diff --git a/examples/alpaka/asyncblur/asyncblur.cpp b/examples/alpaka/asyncblur/asyncblur.cpp index 3855bdcf1b..b70b2ca5ba 100644 --- a/examples/alpaka/asyncblur/asyncblur.cpp +++ b/examples/alpaka/asyncblur/asyncblur.cpp @@ -207,7 +207,7 @@ try } // LLAMA - using ArrayIndex = llama::ArrayIndex<2>; + using ArrayIndex = llama::ArrayIndex; auto treeOperationList = llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT()}; const auto hostMapping @@ -299,7 +299,7 @@ try struct VirtualHostElement { llama::VirtualView virtualHost; - const llama::ArrayExtentsDynamic<2> validMiniSize; + const llama::ArrayExtentsDynamic validMiniSize; }; std::list virtualHostList; for(std::size_t chunk_y = 0; chunk_y < chunks[0]; ++chunk_y) diff --git a/examples/alpaka/daxpy/daxpy.cpp b/examples/alpaka/daxpy/daxpy.cpp index 4943dd4e41..3e7a21ac64 100644 --- a/examples/alpaka/daxpy/daxpy.cpp +++ b/examples/alpaka/daxpy/daxpy.cpp @@ -183,16 +183,20 @@ set ylabel "runtime [s]" const auto extents = llama::ArrayExtents{PROBLEM_SIZE}; daxpy_alpaka_llama("AoS", plotFile, llama::mapping::AoS{extents, double{}}); - daxpy_alpaka_llama("SoA", plotFile, llama::mapping::SoA, double, false>{extents}); + daxpy_alpaka_llama( + "SoA", + plotFile, + llama::mapping::SoA, double, false>{extents}); daxpy_alpaka_llama( "Bytesplit", plotFile, - llama::mapping::Bytesplit, double, llama::mapping::BindAoS<>::fn>{extents}); + llama::mapping::Bytesplit, double, llama::mapping::BindAoS<>::fn>{ + extents}); daxpy_alpaka_llama( "ChangeType D->F", plotFile, llama::mapping::ChangeType< - llama::ArrayExtentsDynamic<1>, + llama::ArrayExtentsDynamic, double, llama::mapping::BindAoS<>::fn, boost::mp11::mp_list>>{extents}); @@ -200,23 +204,29 @@ set ylabel "runtime [s]" daxpy_alpaka_llama( "Bitpack 52^{11} CT", plotFile, - llama::mapping:: - BitPackedFloatSoA, double, llama::Constant<11>, llama::Constant<52>>{ - extents}); + llama::mapping::BitPackedFloatSoA< + llama::ArrayExtentsDynamic, + double, + llama::Constant<11>, + llama::Constant<52>>{extents}); daxpy_alpaka_llama("Bitpack 23^{8}", plotFile, llama::mapping::BitPackedFloatSoA{extents, 8, 23, double{}}); daxpy_alpaka_llama( "Bitpack 23^{8} CT", plotFile, - llama::mapping:: - BitPackedFloatSoA, double, llama::Constant<8>, llama::Constant<23>>{ - extents}); + llama::mapping::BitPackedFloatSoA< + llama::ArrayExtentsDynamic, + double, + llama::Constant<8>, + llama::Constant<23>>{extents}); daxpy_alpaka_llama("Bitpack 10^{5}", plotFile, llama::mapping::BitPackedFloatSoA{extents, 5, 10, double{}}); daxpy_alpaka_llama( "Bitpack 10^{5} CT", plotFile, - llama::mapping:: - BitPackedFloatSoA, double, llama::Constant<5>, llama::Constant<10>>{ - extents}); + llama::mapping::BitPackedFloatSoA< + llama::ArrayExtentsDynamic, + double, + llama::Constant<5>, + llama::Constant<10>>{extents}); plotFile << R"(EOD plot $data using 2:xtic(1) diff --git a/examples/alpaka/nbody/nbody.cpp b/examples/alpaka/nbody/nbody.cpp index 75fc24c898..433bf2ff86 100644 --- a/examples/alpaka/nbody/nbody.cpp +++ b/examples/alpaka/nbody/nbody.cpp @@ -262,7 +262,7 @@ void run(std::ostream& plotFile) auto mapping = [] { - using ArrayExtents = llama::ArrayExtentsDynamic<1, int>; + using ArrayExtents = llama::ArrayExtentsDynamic; const auto extents = ArrayExtents{PROBLEM_SIZE}; if constexpr(MappingGM == AoS) return llama::mapping::AoS{extents}; diff --git a/examples/alpaka/pic/pic.cpp b/examples/alpaka/pic/pic.cpp index a778427212..67137d70f7 100644 --- a/examples/alpaka/pic/pic.cpp +++ b/examples/alpaka/pic/pic.cpp @@ -263,7 +263,7 @@ auto setup(Queue& queue, const Dev& dev, const DevHost& devHost) const auto fieldMapping = [] { - using ArrayExtents = llama::ArrayExtentsDynamic<2>; + using ArrayExtents = llama::ArrayExtentsDynamic; const auto fieldExtents = ArrayExtents{{X_, Y_}}; if constexpr(FieldMapping == 0) return llama::mapping::AoS(fieldExtents); @@ -309,7 +309,7 @@ auto setup(Queue& queue, const Dev& dev, const DevHost& devHost) auto particleMapping = [&] { - using ArrayExtents = llama::ArrayExtentsDynamic<1>; + using ArrayExtents = llama::ArrayExtentsDynamic; const auto particleExtents = ArrayExtents{numpart}; if constexpr(ParticleMapping == 0) return llama::mapping::AoS{particleExtents}; diff --git a/examples/alpaka/vectoradd/vectoradd.cpp b/examples/alpaka/vectoradd/vectoradd.cpp index 6677ee3bd6..157a42efc2 100644 --- a/examples/alpaka/vectoradd/vectoradd.cpp +++ b/examples/alpaka/vectoradd/vectoradd.cpp @@ -82,7 +82,7 @@ try // LLAMA const auto mapping = [&] { - using ArrayExtents = llama::ArrayExtentsDynamic<1>; + using ArrayExtents = llama::ArrayExtentsDynamic; const auto extents = ArrayExtents{PROBLEM_SIZE}; if constexpr(MAPPING == 0) return llama::mapping::AoS{extents}; From c2697f00a295e2402738bed2fd520e88432d48da Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 10 Jun 2022 19:02:24 +0200 Subject: [PATCH 3/4] Convert indices in asyncblur to int --- examples/alpaka/asyncblur/asyncblur.cpp | 114 ++++++++++++------------ 1 file changed, 56 insertions(+), 58 deletions(-) diff --git a/examples/alpaka/asyncblur/asyncblur.cpp b/examples/alpaka/asyncblur/asyncblur.cpp index b70b2ca5ba..2707ec6ae8 100644 --- a/examples/alpaka/asyncblur/asyncblur.cpp +++ b/examples/alpaka/asyncblur/asyncblur.cpp @@ -71,7 +71,7 @@ using PixelOnAcc = llama::Record< /** Alpaka kernel functor used to blur a small image living in the device memory * using the \ref PixelOnAcc record dimension */ -template +template struct BlurKernel { template @@ -85,10 +85,8 @@ struct BlurKernel { // Using SoA for the shared memory constexpr auto sharedChunkSize = ElemsPerBlock + 2 * KernelSize; - constexpr auto sharedMapping = llama::mapping::SoA< - llama::ArrayExtents, - typename View::RecordDim, - false>{}; + constexpr auto sharedMapping = llama::mapping:: + SoA, typename View::RecordDim, false>{}; auto& sharedMem = alpaka::declareSharedVar(acc); return llama::View(sharedMapping, llama::Array{&sharedMem[0]}); } @@ -102,9 +100,9 @@ struct BlurKernel constexpr auto threadsPerBlock = ElemsPerBlock / Elems; const auto threadIdxInBlock = alpaka::getIdx(acc); - const std::size_t bStart[2] + const int bStart[2] = {bi[0] * ElemsPerBlock + threadIdxInBlock[0], bi[1] * ElemsPerBlock + threadIdxInBlock[1]}; - const std::size_t bEnd[2] = { + const int bEnd[2] = { alpaka::math::min(acc, bStart[0] + ElemsPerBlock + 2 * KernelSize, oldImage.mapping().extents()[0]), alpaka::math::min(acc, bStart[1] + ElemsPerBlock + 2 * KernelSize, oldImage.mapping().extents()[1]), }; @@ -117,8 +115,8 @@ struct BlurKernel alpaka::syncBlockThreads(acc); } - const std::size_t start[2] = {ti[0] * Elems, ti[1] * Elems}; - const std::size_t end[2] = { + const int start[2] = {ti[0] * Elems, ti[1] * Elems}; + const int end[2] = { alpaka::math::min(acc, start[0] + Elems, oldImage.mapping().extents()[0] - 2 * KernelSize), alpaka::math::min(acc, start[1] + Elems, oldImage.mapping().extents()[1] - 2 * KernelSize), }; @@ -143,9 +141,9 @@ struct BlurKernel for(auto a = iAStart; a < i_a_end; ++a) { if constexpr(SHARED) - sum += sharedView(std::size_t(b), std::size_t(a)); + sum += sharedView(b, a); else - sum += oldImage(std::size_t(b), std::size_t(a)); + sum += oldImage(b, a); } sum /= FP((2 * KernelSize + 1) * (2 * KernelSize + 1)); newImage(y + KernelSize, x + KernelSize) = sum; @@ -159,7 +157,7 @@ try // ALPAKA using Dim = alpaka::DimInt<2>; - using Acc = alpaka::ExampleDefaultAcc; + using Acc = alpaka::ExampleDefaultAcc; // using Acc = alpaka::AccGpuCudaRt; // using Acc = alpaka::AccCpuSerial; @@ -171,19 +169,19 @@ try const DevAcc devAcc = alpaka::getDevByIdx(0); const DevHost devHost = alpaka::getDevByIdx(0); std::vector queue; - for(std::size_t i = 0; i < CHUNK_COUNT; ++i) + for(int i = 0; i < CHUNK_COUNT; ++i) queue.emplace_back(devAcc); // ASYNCCOPY - std::size_t img_x = DEFAULT_IMG_X; - std::size_t img_y = DEFAULT_IMG_Y; - std::size_t buffer_x = DEFAULT_IMG_X + 2 * KERNEL_SIZE; - std::size_t buffer_y = DEFAULT_IMG_Y + 2 * KERNEL_SIZE; + int img_x = DEFAULT_IMG_X; + int img_y = DEFAULT_IMG_Y; + int buffer_x = DEFAULT_IMG_X + 2 * KERNEL_SIZE; + int buffer_y = DEFAULT_IMG_Y + 2 * KERNEL_SIZE; - constexpr std::size_t hardwareThreads = 2; // relevant for OpenMP2Threads + constexpr int hardwareThreads = 2; // relevant for OpenMP2Threads using Distribution = common::ThreadsElemsDistribution; - constexpr std::size_t elemCount = Distribution::elemCount; - constexpr std::size_t threadCount = Distribution::threadCount; + constexpr int elemCount = Distribution::elemCount; + constexpr int threadCount = Distribution::threadCount; std::vector image; std::string out_filename = "output.png"; @@ -207,13 +205,13 @@ try } // LLAMA - using ArrayIndex = llama::ArrayIndex; + using ArrayIndex = llama::ArrayIndex; auto treeOperationList = llama::Tuple{llama::mapping::tree::functor::LeafOnlyRT()}; const auto hostMapping = llama::mapping::tree::Mapping{llama::ArrayExtents{buffer_y, buffer_x}, treeOperationList, Pixel{}}; const auto devMapping = llama::mapping::tree::Mapping{ - llama::ArrayExtents{}, + llama::ArrayExtents{}, treeOperationList, PixelOnAcc{}}; using DevMapping = std::decay_t; @@ -226,26 +224,26 @@ try Stopwatch chrono; - auto hostBuffer = alpaka::allocBuf(devHost, hostBufferSize); + auto hostBuffer = alpaka::allocBuf(devHost, hostBufferSize); auto hostView = viewAlpakaBuffer(hostMapping, hostBuffer); - std::vector, std::size_t>> hostChunkBuffer; + std::vector, int>> hostChunkBuffer; std::vector> hostChunkView; - std::vector, std::size_t>> devOldBuffer; - std::vector, std::size_t>> devNewBuffer; + std::vector, int>> devOldBuffer; + std::vector, int>> devNewBuffer; std::vector> devOldView; std::vector> devNewView; - for(std::size_t i = 0; i < CHUNK_COUNT; ++i) + for(int i = 0; i < CHUNK_COUNT; ++i) { - hostChunkBuffer.push_back(alpaka::allocBuf(devHost, devBufferSize)); + hostChunkBuffer.push_back(alpaka::allocBuf(devHost, devBufferSize)); hostChunkView.push_back(viewAlpakaBuffer(devMapping, hostChunkBuffer.back())); - devOldBuffer.push_back(alpaka::allocBuf(devAcc, devBufferSize)); + devOldBuffer.push_back(alpaka::allocBuf(devAcc, devBufferSize)); devOldView.push_back(viewAlpakaBuffer(devMapping, devOldBuffer.back())); - devNewBuffer.push_back(alpaka::allocBuf(devAcc, devBufferSize)); + devNewBuffer.push_back(alpaka::allocBuf(devAcc, devBufferSize)); devNewView.push_back(viewAlpakaBuffer(devMapping, devNewBuffer.back())); } @@ -256,10 +254,10 @@ try image.resize(img_x * img_y * 3); std::default_random_engine generator; std::normal_distribution distribution{FP(0), FP(0.5)}; - for(std::size_t y = 0; y < buffer_y; ++y) + for(int y = 0; y < buffer_y; ++y) { LLAMA_INDEPENDENT_DATA - for(std::size_t x = 0; x < buffer_x; ++x) + for(int x = 0; x < buffer_x; ++x) { hostView(y, x)(tag::R()) = std::abs(distribution(generator)); hostView(y, x)(tag::G()) = std::abs(distribution(generator)); @@ -269,13 +267,13 @@ try } else { - for(std::size_t y = 0; y < buffer_y; ++y) + for(int y = 0; y < buffer_y; ++y) { LLAMA_INDEPENDENT_DATA - for(std::size_t x = 0; x < buffer_x; ++x) + for(int x = 0; x < buffer_x; ++x) { - const auto X = std::clamp(x, KERNEL_SIZE, img_x + KERNEL_SIZE - 1); - const auto Y = std::clamp(y, KERNEL_SIZE, img_y + KERNEL_SIZE - 1); + const auto X = std::clamp(x, KERNEL_SIZE, img_x + KERNEL_SIZE - 1); + const auto Y = std::clamp(y, KERNEL_SIZE, img_y + KERNEL_SIZE - 1); const auto* pixel = &image[((Y - KERNEL_SIZE) * img_x + X - KERNEL_SIZE) * 3]; hostView(y, x)(tag::R()) = FP(pixel[0]) / 255; hostView(y, x)(tag::G()) = FP(pixel[1]) / 255; @@ -285,25 +283,25 @@ try } chrono.printAndReset("Init"); - const auto elems = alpaka::Vec(elemCount, elemCount); - const auto threads = alpaka::Vec(threadCount, threadCount); - const auto blocks = alpaka::Vec( - static_cast((CHUNK_SIZE + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK), - static_cast((CHUNK_SIZE + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK)); - const alpaka::Vec chunks( - static_cast((img_y + CHUNK_SIZE - 1) / CHUNK_SIZE), - static_cast((img_x + CHUNK_SIZE - 1) / CHUNK_SIZE)); + const auto elems = alpaka::Vec(elemCount, elemCount); + const auto threads = alpaka::Vec(threadCount, threadCount); + const auto blocks = alpaka::Vec( + static_cast((CHUNK_SIZE + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK), + static_cast((CHUNK_SIZE + ELEMS_PER_BLOCK - 1) / ELEMS_PER_BLOCK)); + const alpaka::Vec chunks( + static_cast((img_y + CHUNK_SIZE - 1) / CHUNK_SIZE), + static_cast((img_x + CHUNK_SIZE - 1) / CHUNK_SIZE)); - const auto workdiv = alpaka::WorkDivMembers{blocks, threads, elems}; + const auto workdiv = alpaka::WorkDivMembers{blocks, threads, elems}; struct VirtualHostElement { llama::VirtualView virtualHost; - const llama::ArrayExtentsDynamic validMiniSize; + const llama::ArrayExtentsDynamic validMiniSize; }; std::list virtualHostList; - for(std::size_t chunk_y = 0; chunk_y < chunks[0]; ++chunk_y) - for(std::size_t chunk_x = 0; chunk_x < chunks[1]; ++chunk_x) + for(int chunk_y = 0; chunk_y < chunks[0]; ++chunk_y) + for(int chunk_x = 0; chunk_x < chunks[1]; ++chunk_x) { // Create virtual view with size of mini view const auto validMiniSize = llama::ArrayExtents{ @@ -312,7 +310,7 @@ try llama::VirtualView virtualHost(hostView, {chunk_y * CHUNK_SIZE, chunk_x * CHUNK_SIZE}); // Find free chunk stream - std::size_t chunkNr = virtualHostList.size(); + int chunkNr = virtualHostList.size(); if(virtualHostList.size() < CHUNK_COUNT) virtualHostList.push_back({virtualHost, validMiniSize}); else @@ -327,10 +325,10 @@ try { // Copy data back LLAMA_INDEPENDENT_DATA - for(std::size_t y = 0; y < chunkIt->validMiniSize[0] - 2 * KERNEL_SIZE; ++y) + for(int y = 0; y < chunkIt->validMiniSize[0] - 2 * KERNEL_SIZE; ++y) { LLAMA_INDEPENDENT_DATA - for(std::size_t x = 0; x < chunkIt->validMiniSize[1] - 2 * KERNEL_SIZE; ++x) + for(int x = 0; x < chunkIt->validMiniSize[1] - 2 * KERNEL_SIZE; ++x) chunkIt->virtualHost(y + KERNEL_SIZE, x + KERNEL_SIZE) = hostChunkView[chunkNr](y + KERNEL_SIZE, x + KERNEL_SIZE); } @@ -347,10 +345,10 @@ try } // Copy data from virtual view to mini view - for(std::size_t y = 0; y < validMiniSize[0]; ++y) + for(int y = 0; y < validMiniSize[0]; ++y) { LLAMA_INDEPENDENT_DATA - for(std::size_t x = 0; x < validMiniSize[1]; ++x) + for(int x = 0; x < validMiniSize[1]; ++x) hostChunkView[chunkNr](y, x) = virtualHost(y, x); } alpaka::memcpy(queue[chunkNr], devOldBuffer[chunkNr], hostChunkBuffer[chunkNr], devBufferSize); @@ -367,14 +365,14 @@ try // Wait for not finished tasks on accelerator auto chunkIt = virtualHostList.begin(); - for(std::size_t chunkNr = 0; chunkNr < CHUNK_COUNT; ++chunkNr) + for(int chunkNr = 0; chunkNr < CHUNK_COUNT; ++chunkNr) { alpaka::wait(queue[chunkNr]); // Copy data back - for(std::size_t y = 0; y < chunkIt->validMiniSize[0] - 2 * KERNEL_SIZE; ++y) + for(int y = 0; y < chunkIt->validMiniSize[0] - 2 * KERNEL_SIZE; ++y) { LLAMA_INDEPENDENT_DATA - for(std::size_t x = 0; x < chunkIt->validMiniSize[1] - 2 * KERNEL_SIZE; ++x) + for(int x = 0; x < chunkIt->validMiniSize[1] - 2 * KERNEL_SIZE; ++x) chunkIt->virtualHost(y + KERNEL_SIZE, x + KERNEL_SIZE) = hostChunkView[chunkNr](y + KERNEL_SIZE, x + KERNEL_SIZE); } @@ -384,10 +382,10 @@ try if(SAVE) { - for(std::size_t y = 0; y < img_y; ++y) + for(int y = 0; y < img_y; ++y) { LLAMA_INDEPENDENT_DATA - for(std::size_t x = 0; x < img_x; ++x) + for(int x = 0; x < img_x; ++x) { auto* pixel = &image[(y * img_x + x) * 3]; pixel[0] = static_cast(hostView(y + KERNEL_SIZE, x + KERNEL_SIZE)(tag::R()) * 255.); From 2a36ee97ddaee0e208b261abf3b5e99d69f54669 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 10 Jun 2022 19:02:37 +0200 Subject: [PATCH 4/4] Fix warnings in tree mapping --- include/llama/mapping/tree/Mapping.hpp | 4 +++- include/llama/mapping/tree/TreeFromDimensions.hpp | 7 +++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/include/llama/mapping/tree/Mapping.hpp b/include/llama/mapping/tree/Mapping.hpp index f319da7540..1d2aa3c76d 100644 --- a/include/llama/mapping/tree/Mapping.hpp +++ b/include/llama/mapping/tree/Mapping.hpp @@ -219,7 +219,9 @@ namespace llama::mapping::tree // TODO(bgruber): propagate use of size_type auto const basicTreeCoord = createTreeCoord>(ai); auto const resultTreeCoord = mergedFunctors.basicCoordToResultCoord(basicTreeCoord, basicTree); - const auto offset = internal::getTreeBlobByte(resultTree, resultTreeCoord); + const auto offset = static_cast(internal::getTreeBlobByte( + resultTree, + resultTreeCoord)); // FIXME(bgruber): size_type should be propagated through getTreeBlobByte return {0, offset}; } }; diff --git a/include/llama/mapping/tree/TreeFromDimensions.hpp b/include/llama/mapping/tree/TreeFromDimensions.hpp index 835bd6c20e..95651b2879 100644 --- a/include/llama/mapping/tree/TreeFromDimensions.hpp +++ b/include/llama/mapping/tree/TreeFromDimensions.hpp @@ -127,11 +127,14 @@ namespace llama::mapping::tree LLAMA_FN_HOST_ACC_INLINE auto createTree(const ArrayIndex& size) { if constexpr(Pos == N - 1) - return TreeFromRecordDim{size[N - 1]}; + return TreeFromRecordDim{ + static_cast(size[N - 1])}; // FIXME(bgruber): propagate index type else { Tuple inner{createTree(size)}; - return Node{size[Pos], inner}; + return Node{ + static_cast(size[Pos]), + inner}; // FIXME(bgruber): propagate index type } };