diff --git a/src/include/mallocMC/allocator.hpp b/src/include/mallocMC/allocator.hpp
index 9fe1f41b..74e0f5c6 100644
--- a/src/include/mallocMC/allocator.hpp
+++ b/src/include/mallocMC/allocator.hpp
@@ -32,7 +32,6 @@
 #include "mallocMC_allocator_handle.hpp"
 #include "mallocMC_constraints.hpp"
 #include "mallocMC_traits.hpp"
-#include "mallocMC_utils.hpp"
 
 #include <alpaka/alpaka.hpp>
 
@@ -113,11 +112,11 @@
         using uint32 = std::uint32_t;
 
     public:
-        using CreationPolicy = T_CreationPolicy;
         using DistributionPolicy = T_DistributionPolicy;
         using OOMPolicy = T_OOMPolicy;
         using ReservePoolPolicy = T_ReservePoolPolicy;
         using AlignmentPolicy = T_AlignmentPolicy;
+        using CreationPolicy = T_CreationPolicy::template AlignmentAwarePolicy<AlignmentPolicy>;
         using HeapInfoVector = std::vector<HeapInfo>;
         using DevAllocator = DeviceAllocator<CreationPolicy, DistributionPolicy, OOMPolicy, AlignmentPolicy>;
         using AllocatorHandle = AllocatorHandleImpl<Allocator>;
@@ -135,13 +134,7 @@
          * @param size number of bytes
          */
         template<typename AlpakaDevice, typename AlpakaQueue>
-        ALPAKA_FN_HOST void
-        /* `volatile size_t size` is required to break clang optimizations which
-         * results into runtime errors. Observed in PIConGPU if size is known at
-         * compile time. The volatile workaround has no negative effects on the
-         * register usage in CUDA.
-         */
-        alloc(AlpakaDevice& dev, AlpakaQueue& queue, size_t volatile size)
+        ALPAKA_FN_HOST void alloc(AlpakaDevice& dev, AlpakaQueue& queue, size_t size)
         {
             void* pool = reservePolicy.setMemPool(dev, size);
             std::tie(pool, size) = AlignmentPolicy::alignPool(pool, size);
diff --git a/src/include/mallocMC/creationPolicies/OldMalloc.hpp b/src/include/mallocMC/creationPolicies/OldMalloc.hpp
index c75534a1..13ee173d 100644
--- a/src/include/mallocMC/creationPolicies/OldMalloc.hpp
+++ b/src/include/mallocMC/creationPolicies/OldMalloc.hpp
@@ -49,9 +49,12 @@
         using uint32 = std::uint32_t;
 
     public:
+        template<typename T_AlignmentPolicy>
+        using AlignmentAwarePolicy = OldMalloc;
+
         static constexpr auto providesAvailableSlots = false;
 
-        template<typename T_AlignmentPolicy, typename AlpakaAcc>
+        template<typename AlpakaAcc>
         ALPAKA_FN_ACC auto create(AlpakaAcc const& acc, uint32 bytes) const -> void*
         {
             return ::malloc(static_cast<size_t>(bytes));
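
The pattern introduced above deserves a quick illustration: the front ends no longer use a creation policy directly but first resolve it through its nested `AlignmentAwarePolicy` alias template, which lets a policy capture the alignment policy at class scope instead of receiving it on every `create` call. A minimal sketch of both variants (all names besides `AlignmentAwarePolicy` are illustrative, not part of this diff):

```c++
#include <cstdint>

// A policy that ignores alignment aliases itself, exactly like OldMalloc above:
struct SimplePolicy
{
    template<typename T_AlignmentPolicy>
    using AlignmentAwarePolicy = SimplePolicy;
};

// A policy that needs the alignment policy re-instantiates itself with it,
// mirroring the ScatterImpl approach further down in this diff:
template<typename T_AlignmentPolicy = void>
struct AlignedPolicy
{
    template<typename T>
    using AlignmentAwarePolicy = AlignedPolicy<T>;

    // T_AlignmentPolicy is now usable in every member without an extra
    // per-function template parameter:
    static std::uint32_t padded(std::uint32_t bytes)
    {
        return T_AlignmentPolicy::applyPadding(bytes);
    }
};
```
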
diff --git a/src/include/mallocMC/creationPolicies/Scatter.hpp b/src/include/mallocMC/creationPolicies/Scatter.hpp
index 0de95765..38117491 100644
--- a/src/include/mallocMC/creationPolicies/Scatter.hpp
+++ b/src/include/mallocMC/creationPolicies/Scatter.hpp
@@ -34,11 +34,11 @@
 #pragma once
 
 #include "../mallocMC_utils.hpp"
-#include "Scatter.hpp"
 
 #include <alpaka/alpaka.hpp>
+#include
+#include
 
-#include
 #include
 #include
 #include <cstdint> /* uint32_t */
@@ -47,6 +47,7 @@
 #include
 #include
 #include
+#include
 
@@ -106,10 +107,16 @@
      */
     template<
         class T_Config = ScatterConf::DefaultScatterConfig,
-        class T_Hashing = ScatterConf::DefaultScatterHashingParams>
-    class Scatter
+        class T_Hashing = ScatterConf::DefaultScatterHashingParams,
+        class T_AlignmentPolicy = void>
+    class ScatterImpl
     {
     public:
+        // TODO(lenz): This is a bit of a round trip due to a change of interface. A larger refactoring should
+        // remove this again.
+        template<typename T>
+        using AlignmentAwarePolicy = ScatterImpl<T_Config, T_Hashing, T>;
+
         using HeapProperties = T_Config;
         using HashingProperties = T_Hashing;
@@ -294,7 +301,8 @@
          * @param spots number of bits that can be used
          * @return next free spot in the bitfield
          */
-        static ALPAKA_FN_ACC inline auto nextspot(uint32 bitfield, uint32 spot, uint32 spots) -> uint32
+        static ALPAKA_FN_ACC inline auto nextspot(auto const& acc, uint32 bitfield, uint32 spot, uint32 spots)
+            -> uint32
         {
             uint32 const low_part = (spot + 1) == sizeof(uint32) * CHAR_BIT ? 0u : (bitfield >> (spot + 1));
             uint32 const high_part = (bitfield << (spots - (spot + 1)));
@@ -302,7 +310,7 @@
             // wrap around the bitfields from the current spot to the left
             bitfield = (high_part | low_part) & selection_mask;
             // compute the step from the current spot in the bitfield
-            uint32 const step = ffs(~bitfield);
+            uint32 const step = alpaka::ffs(acc, static_cast<std::make_signed_t<uint32>>(~bitfield));
             // and return the new spot
             return (spot + step) % spots;
         }
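
The `nextspot` helper is the heart of the chunk search. The same wrap-around scan, condensed into a host-side sketch with C++20 `std::countr_zero` standing in for `alpaka::ffs` (the `selection_mask` definition is assumed to select the low `spots` bits, as in the surrounding code):

```c++
#include <bit>     // std::countr_zero (C++20)
#include <climits>
#include <cstdint>

// Rotate the bitfield so the search starts just past `spot`, then find the
// first zero bit. The caller guarantees a free spot exists (see the popcount
// guard in usespot below); otherwise the result is unspecified.
std::uint32_t nextFreeSpot(std::uint32_t bitfield, std::uint32_t spot, std::uint32_t spots)
{
    std::uint32_t const low_part = (spot + 1 == sizeof(std::uint32_t) * CHAR_BIT) ? 0U : (bitfield >> (spot + 1));
    std::uint32_t const high_part = bitfield << (spots - (spot + 1));
    std::uint32_t const selection_mask = spots == 32 ? ~0U : ((1U << spots) - 1U);
    std::uint32_t const rotated = (high_part | low_part) & selection_mask;
    // ffs() is 1-based, countr_zero() is 0-based, hence the +1.
    std::uint32_t const step = static_cast<std::uint32_t>(std::countr_zero(~rotated)) + 1U;
    return (spot + step) % spots;
}
```
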
@@ -344,9 +352,9 @@
                 // note: popc(old) == spots should be sufficient,
                 // but if someone corrupts the memory we end up in an
                 // endless loop in here...
-                if(popc(old) >= spots)
+                if(alpaka::popcount(acc, old) >= static_cast<int>(spots))
                     return -1;
-                spot = nextspot(old, spot, spots);
+                spot = nextspot(acc, old, spot, spots);
             }
         }
@@ -376,10 +384,10 @@
             if(fullsegments != 32)
                 return alpaka::math::min(
                     acc,
-                    31,
+                    31U,
                     alpaka::math::max(
                         acc,
-                        0,
+                        0U,
                         (int) pagesize - (int) fullsegments * segmentsize - (int) sizeof(uint32))
                         / chunksize);
             else
@@ -410,8 +418,8 @@
             uint32 spot = randInit() % segments;
             uint32 const mask = _ptes[page].bitmask;
             if((mask & (1u << spot)) != 0)
-                spot = nextspot(mask, spot, segments);
-            uint32 const tries = segments - popc(mask);
+                spot = nextspot(acc, mask, spot, segments);
+            uint32 const tries = segments - alpaka::popcount(acc, mask);
             uint32* onpagemasks = onPageMasksPosition(page, segments);
             for(uint32 i = 0; i < tries; ++i)
             {
@@ -419,7 +427,7 @@
                 if(hspot != -1)
                     return _page[page].data + (32 * spot + hspot) * chunksize;
                 alpaka::atomicOp<alpaka::AtomicOr>(acc, (uint32*) &_ptes[page].bitmask, 1u << spot);
-                spot = nextspot(mask, spot, segments);
+                spot = nextspot(acc, mask, spot, segments);
             }
             return 0;
         }
@@ -542,17 +550,17 @@
          * @return pointer to a free chunk on a page, 0 if we were unable to
          * obtain a free chunk
          */
-        template<typename AlignmentPolicy, typename AlpakaAcc>
+        template<typename AlpakaAcc>
        ALPAKA_FN_ACC auto allocChunked(AlpakaAcc const& acc, uint32 bytes) -> void*
         {
             // use the minimal allocation size to increase the hit rate for small allocations.
-            uint32 const paddedMinChunkSize = AlignmentPolicy::applyPadding(minChunkSize);
+            uint32 const paddedMinChunkSize = T_AlignmentPolicy::applyPadding(minChunkSize);
             uint32 const minAllocation = alpaka::math::max(acc, bytes, paddedMinChunkSize);
             uint32 const numpages = _numpages;
             uint32 const pagesperblock = numpages / _accessblocks;
-            uint32 const reloff = warpSize * minAllocation / pagesize;
-            uint32 const start_page_in_block = (minAllocation * hashingK + hashingDistMP * smid()
-                                                + (hashingDistWP + hashingDistWPRel * reloff) * warpid())
+            uint32 const reloff = warpSize<AlpakaAcc> * minAllocation / pagesize;
+            uint32 const start_page_in_block = (minAllocation * hashingK + hashingDistMP * smid(acc)
+                                                + (hashingDistWP + hashingDistWPRel * reloff) * warpid(acc))
                 % pagesperblock;
             uint32 const maxchunksize = alpaka::math::min(
                 acc,
@@ -687,7 +695,7 @@
             /** Take care that the meta data changes where we did not use atomics are propagated to all
              * other threads.
              */
-            threadfenceDevice(acc);
+            alpaka::mem_fence(acc, alpaka::memory_scope::Device{});
             /* Remove chunk information.
              * It is important that this call happened after page init is called because scatter malloc
              * is updating the chunksize without notify the action by increasing the page count
@@ -742,8 +750,9 @@
             // mark it as free
             uint32 const nMasks = fullsegments + (additional_chunks > 0 ? 1 : 0);
             uint32* onpagemasks = onPageMasksPosition(page, nMasks);
-            uint32 old
-                = alpaka::atomicOp<alpaka::AtomicAnd>(acc, &onpagemasks[segment], ~(1u << withinsegment));
+            /* currently unchecked:
+             * uint32 old = */
+            alpaka::atomicOp<alpaka::AtomicAnd>(acc, &onpagemasks[segment], ~(1u << withinsegment));
 
             // always do this, since it might fail due to a
             // race-condition with addChunkHierarchy
@@ -768,7 +777,7 @@
                 alpaka::atomicOp<alpaka::AtomicExch>(acc, (uint32*) (_regions + region), 0u);
                 uint32 const pagesperblock = _numpages / _accessblocks;
                 uint32 const block = page / pagesperblock;
-                if(warpid() + laneid() == 0)
+                if(warpid(acc) + laneid() == 0)
                     alpaka::atomicOp<alpaka::AtomicMin>(acc, (uint32*) &_firstfreeblock, block);
             }
         }
@@ -817,7 +826,7 @@
             uint32 endpage,
             uint32 bytes) -> void*
         {
-            uint32 const pagestoalloc = divup(bytes, pagesize);
+            uint32 const pagestoalloc = ceilingDivision(bytes, pagesize);
             uint32 freecount = 0;
             bool left_free = false;
             for(uint32 search_page = startpage + 1; search_page > endpage;)
@@ -893,11 +902,11 @@
             // only one thread per warp can acquire the mutex
             void* res = 0;
             // based on the alpaka backend the lanemask type can be 64bit
-            auto const mask = activemask();
-            uint32_t const num = popc(mask);
+            auto const mask = alpaka::warp::activemask(acc);
+            uint32_t const num = alpaka::popcount(acc, mask);
             // based on the alpaka backend the lanemask type can be 64bit
-            auto const lanemask = lanemask_lt();
-            uint32_t const local_id = popc(lanemask & mask);
+            auto const lanemask = lanemask_lt(acc);
+            uint32_t const local_id = alpaka::popcount(acc, lanemask & mask);
             for(unsigned int active = 0; active < num; ++active)
                 if(active == local_id)
                     res = allocPageBasedSingle(acc, bytes);
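
The last hunk above implements warp-level serialization so that only one lane at a time runs the non-reentrant page allocation. Condensed into a standalone sketch (assuming a device context and the `lanemask_lt` helper from mallocMC_utils.hpp; `work` is a hypothetical callable):

```c++
template<typename TAcc, typename TWork>
ALPAKA_FN_ACC auto oneLaneAtATime(TAcc const& acc, TWork work) -> void*
{
    auto const mask = alpaka::warp::activemask(acc);        // lanes currently active
    std::uint32_t const num = alpaka::popcount(acc, mask);  // how many of them
    // rank of this lane among the active lanes:
    std::uint32_t const localId = alpaka::popcount(acc, lanemask_lt(acc) & mask);
    void* result = nullptr;
    for(std::uint32_t active = 0; active < num; ++active)
        if(active == localId) // exactly one active lane matches per iteration
            result = work(acc);
    return result;
}
```
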
@@ -913,11 +922,11 @@
         template<typename AlpakaAcc>
         ALPAKA_FN_ACC void deallocPageBased(AlpakaAcc const& acc, void* mem, uint32 page, uint32 bytes)
         {
-            uint32 const pages = divup(bytes, pagesize);
+            uint32 const pages = ceilingDivision(bytes, pagesize);
             for(uint32 p = page; p < page + pages; ++p)
                 _page[p].init();
 
-            threadfenceDevice(acc);
+            alpaka::mem_fence(acc, alpaka::memory_scope::Device{});
 
             for(uint32 p = page; p < page + pages; ++p)
                 alpaka::atomicOp<alpaka::AtomicCas>(acc, (uint32*) &_ptes[p].chunksize, bytes, 0u);
@@ -931,7 +940,7 @@
          * @param bytes number of bytes to allocate
          * @return pointer to the allocated memory
          */
-        template<typename AlignmentPolicy, typename AlpakaAcc>
+        template<typename AlpakaAcc>
         ALPAKA_FN_ACC auto create(AlpakaAcc const& acc, uint32 bytes) -> void*
         {
             if(bytes == 0)
@@ -944,7 +953,7 @@
              */
             if(bytes <= pagesize)
                 // chunck based
-                return allocChunked<AlignmentPolicy>(acc, bytes);
+                return allocChunked(acc, bytes);
             else
                 // allocate a range of pages
                 return allocPageBased(acc, bytes);
@@ -1008,23 +1017,6 @@
             uint32 numpages = numregions * regionsize;
             // pointer is copied (copy is called page)
             Page* page = (Page*) memory;
-            // sec check for alignment
-            // copy is checked
-            // PointerEquivalent alignmentstatus = ((PointerEquivalent)page)
-            // & (16 -1); if(alignmentstatus != 0)
-            //{
-            //    if(linid == 0){
-            //        printf("c Before:\n");
-            //        printf("c dataAlignment: %d\n",16);
-            //        printf("c Alignmentstatus: %d\n",alignmentstatus);
-            //        printf("c size_t memsize %llu byte\n", memsize);
-            //        printf("c void *memory %p\n", page);
-            //    }
-            //    //copy is adjusted, potentially pointer to higher address
-            //    now. page =(Page*)(((PointerEquivalent)page) + 16 -
-            //    alignmentstatus); if(linid == 0) printf("c Heap Warning:
-            //    memory to use not 16 byte aligned...\n");
-            //}
             // We have to calculate these values here, before using them for other things.
             // First calculate how many blocks of the given size fit our memory pages in principle.
@@ -1056,16 +1048,6 @@
             ptes = (PTE*) (page + numpages);
             regions = (uint32*) (ptes + numpages);
 
-            // if(linid == 0) printf("Heap info: wasting %d
-            // bytes\n",(((POINTEREQUIVALENT)memory) + memsize) -
-            // (POINTEREQUIVALENT)(regions + numregions));
-
-            // if(linid == 0 && alignmentstatus != 0){
-            //    printf("c Was shrinked automatically to:\n");
-            //    printf("c size_t memsize %llu byte\n", memsize);
-            //    printf("c void *memory %p\n", page);
-            //}
-
             for(uint32 i = linid; i < numpages; i += totalThreads)
             {
                 ptes[i].init();
@@ -1116,11 +1098,7 @@
                                  AlpakaAcc const& m_acc,
                                  T_DeviceAllocator* m_heap,
                                  void* m_heapmem,
-                                 size_t m_memsize)
-            {
-                m_heap->pool = m_heapmem;
-                m_heap->initDeviceFunction(m_acc, m_heapmem, m_memsize);
-            };
+                                 size_t m_memsize) { m_heap->initDeviceFunction(m_acc, m_heapmem, m_memsize); };
             using Dim = typename alpaka::trait::DimType<AlpakaAcc>::type;
             using Idx = typename alpaka::trait::IdxType<AlpakaAcc>::type;
             using VecType = alpaka::Vec<Dim, Idx>;
@@ -1199,7 +1177,7 @@
          * @param stride the stride should be equal to the number of
          * different gids (and therefore of value max(gid)-1)
          */
-        template<typename AlignmentPolicy, typename AlpakaAcc>
+        template<typename AlpakaAcc>
         ALPAKA_FN_ACC auto getAvailaibleSlotsDeviceFunction(
             AlpakaAcc const& acc,
             size_t slotSize,
@@ -1223,7 +1201,7 @@
                         chunksize = alpaka::math::max(
                             acc,
                             (uint32) slotSize,
-                            AlignmentPolicy::applyPadding(minChunkSize)); // ensure minimum chunk size
+                            T_AlignmentPolicy::applyPadding(minChunkSize)); // ensure minimum chunk size
                         slotcount += countFreeChunksInPage(
                             acc,
                             currentpage,
@@ -1240,7 +1218,7 @@
             {
                 // 1 slot needs multiple pages
                 if(gid > 0)
                     return 0; // do this serially
-                uint32 const pagestoalloc = divup((uint32) slotSize, pagesize);
+                uint32 const pagestoalloc = ceilingDivision((uint32) slotSize, pagesize);
                 uint32 freecount = 0;
                 for(uint32 currentpage = _numpages; currentpage > 0;)
                 { // this already includes all superblocks
@@ -1296,8 +1274,8 @@
                 auto const gid = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc).sum();
 
                 auto const nWorker = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc).prod();
-                unsigned const temp = heapPtr->template getAvailaibleSlotsDeviceFunction<
-                    typename T_DeviceAllocator::AlignmentPolicy>(acc, numBytes, gid, nWorker);
+                unsigned const temp
+                    = heapPtr->template getAvailaibleSlotsDeviceFunction(acc, numBytes, gid, nWorker);
                 if(temp)
                     alpaka::atomicOp<alpaka::AtomicAdd>(acc, slots, temp);
             };
@@ -1354,21 +1332,22 @@
          *
          * @param slotSize the size of allocatable elements to count
          */
-        template<typename AlignmentPolicy, typename AlpakaAcc>
+        template<typename AlpakaAcc>
         ALPAKA_FN_ACC auto getAvailableSlotsAccelerator(AlpakaAcc const& acc, size_t slotSize) -> unsigned
         {
             int const wId = warpid_withinblock(acc); // do not use warpid-function, since
                                                      // this value is not guaranteed to
                                                      // be stable across warp lifetime
-            uint32 const activeThreads = popc(activemask());
+            uint32 const activeThreads = alpaka::popcount(acc, alpaka::warp::activemask(acc));
+
+            constexpr auto warpsize = warpSize<AlpakaAcc>;
             auto& activePerWarp = alpaka::declareSharedVar<
-                std::uint32_t[maxThreadsPerBlock / warpSize],
+                std::uint32_t[maxThreadsPerBlock / warpsize],
                 __COUNTER__>(acc); // maximum number of warps in a block
             auto& warpResults
-                = alpaka::declareSharedVar<unsigned[maxThreadsPerBlock / warpSize], __COUNTER__>(acc);
+                = alpaka::declareSharedVar<unsigned[maxThreadsPerBlock / warpsize], __COUNTER__>(acc);
 
             warpResults[wId] = 0;
             activePerWarp[wId] = 0;
@@ -1383,16 +1362,13 @@
             // printf("Block %d, id %d: activeThreads=%d
             // linearId=%d\n",blockIdx.x,threadIdx.x,activeThreads,linearId);
 
-            unsigned const temp = this->template getAvailaibleSlotsDeviceFunction<AlignmentPolicy>(
-                acc,
-                slotSize,
-                linearId,
-                activeThreads);
+            unsigned const temp
+                = this->template getAvailaibleSlotsDeviceFunction(acc, slotSize, linearId, activeThreads);
             if(temp)
                 alpaka::atomicOp<alpaka::AtomicAdd>(acc, &warpResults[wId], temp);
 
             alpaka::syncBlockThreads(acc);
-            threadfenceBlock(acc);
+            alpaka::mem_fence(acc, alpaka::memory_scope::Block{});
 
             return warpResults[wId];
         }
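
`getAvailableSlotsAccelerator` combines several of the new primitives. The underlying shared-memory idiom, reduced to its skeleton (using `warpSize`, `maxThreadsPerBlock`, and `warpid_withinblock` as defined in mallocMC_utils.hpp later in this diff):

```c++
template<typename TAcc>
ALPAKA_FN_ACC auto sumPerWarp(TAcc const& acc, std::uint32_t value) -> std::uint32_t
{
    constexpr auto warpsize = warpSize<TAcc>;
    // one counter per warp of the block, allocated in shared memory
    auto& counters
        = alpaka::declareSharedVar<std::uint32_t[maxThreadsPerBlock / warpsize], __COUNTER__>(acc);
    auto const wId = warpid_withinblock(acc);
    counters[wId] = 0U;
    alpaka::syncBlockThreads(acc);
    alpaka::atomicOp<alpaka::AtomicAdd>(acc, &counters[wId], value);
    alpaka::syncBlockThreads(acc);
    return counters[wId]; // every thread sees its warp's total
}
```
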
@@ -1416,5 +1392,12 @@
         }
     };
 
+    template<class T_Config = ScatterConf::DefaultScatterConfig, class T_Hashing = ScatterConf::DefaultScatterHashingParams>
+    struct Scatter
+    {
+        template<typename T_AlignmentPolicy>
+        using AlignmentAwarePolicy = ScatterImpl<T_Config, T_Hashing, T_AlignmentPolicy>;
+    };
+
 } // namespace CreationPolicies
 } // namespace mallocMC
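
For context, this is where the new `Scatter` front end slots into a full allocator definition. The exact policy spellings below follow mallocMC's usual examples and should be treated as illustrative:

```c++
using Dim = alpaka::DimInt<1>;
using Idx = std::size_t;
using Acc = alpaka::AccGpuCudaRt<Dim, Idx>; // any alpaka accelerator works

using ScatterAllocator = mallocMC::Allocator<
    Acc,
    mallocMC::CreationPolicies::Scatter<>,         // the wrapper struct added above
    mallocMC::DistributionPolicies::XMallocSIMD<>,
    mallocMC::OOMPolicies::ReturnNull,
    mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>,
    mallocMC::AlignmentPolicies::Shrink<>>;

// Allocator now resolves its creation policy as
// Scatter<>::AlignmentAwarePolicy<Shrink<>>, i.e. a fully specialized
// ScatterImpl, so no per-call alignment template parameter remains.
```
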
diff --git a/src/include/mallocMC/device_allocator.hpp b/src/include/mallocMC/device_allocator.hpp
index 52e4e736..0f6fe090 100644
--- a/src/include/mallocMC/device_allocator.hpp
+++ b/src/include/mallocMC/device_allocator.hpp
@@ -2,10 +2,11 @@
   mallocMC: Memory Allocator for Many Core Architectures.
   https://www.hzdr.de/crp
 
-  Copyright 2014 - 2015 Institute of Radiation Physics,
+  Copyright 2014 - 2024 Institute of Radiation Physics,
                  Helmholtz-Zentrum Dresden - Rossendorf
 
   Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
+              Julian J. Lenz - j.lenz ( at ) hzdr.de
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -28,9 +29,7 @@
 
 #pragma once
 
-#include "mallocMC_constraints.hpp"
 #include "mallocMC_traits.hpp"
-#include "mallocMC_utils.hpp"
 
 #include <alpaka/alpaka.hpp>
@@ -58,7 +57,7 @@
         typename T_DistributionPolicy,
         typename T_OOMPolicy,
         typename T_AlignmentPolicy>
-    class DeviceAllocator : public T_CreationPolicy
+    class DeviceAllocator : public T_CreationPolicy::template AlignmentAwarePolicy<T_AlignmentPolicy>
     {
         using uint32 = std::uint32_t;
 
@@ -68,24 +67,31 @@
         using OOMPolicy = T_OOMPolicy;
         using AlignmentPolicy = T_AlignmentPolicy;
 
-        void* pool;
-
         template<typename AlpakaAcc>
         ALPAKA_FN_ACC auto malloc(AlpakaAcc const& acc, size_t bytes) -> void*
         {
+            if(bytes == 0U)
+            {
+                return nullptr;
+            }
             bytes = AlignmentPolicy::applyPadding(bytes);
             DistributionPolicy distributionPolicy(acc);
             uint32 const req_size = distributionPolicy.collect(acc, bytes);
-            void* memBlock = CreationPolicy::template create<AlignmentPolicy>(acc, req_size);
+            void* memBlock = CreationPolicy::template AlignmentAwarePolicy<AlignmentPolicy>::create(acc, req_size);
             if(CreationPolicy::isOOM(memBlock, req_size))
+            {
                 memBlock = OOMPolicy::handleOOM(memBlock);
+            }
             return distributionPolicy.distribute(acc, memBlock);
         }
 
         template<typename AlpakaAcc>
-        ALPAKA_FN_ACC void free(AlpakaAcc const& acc, void* p)
+        ALPAKA_FN_ACC void free(AlpakaAcc const& acc, void* pointer)
         {
-            CreationPolicy::destroy(acc, p);
+            if(pointer != nullptr)
+            {
+                CreationPolicy::template AlignmentAwarePolicy<AlignmentPolicy>::destroy(acc, pointer);
+            }
         }
 
         /** Provide the number of available free slots.
@@ -101,9 +107,15 @@
         {
             slotSize = AlignmentPolicy::applyPadding(slotSize);
             if constexpr(Traits<DeviceAllocator>::providesAvailableSlots)
-                return CreationPolicy::template getAvailableSlotsAccelerator<AlignmentPolicy>(acc, slotSize);
+            {
+                return CreationPolicy::template AlignmentAwarePolicy<AlignmentPolicy>::getAvailableSlotsAccelerator(
+                    acc,
+                    slotSize);
+            }
             else
-                return 0u;
+            {
+                return 0U;
+            }
         }
     };
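
Two behavioral guarantees fall out of the device_allocator.hpp changes and can be relied upon in kernels: zero-byte requests return `nullptr`, and freeing `nullptr` is a no-op. A device-side sketch (allocator handle type abbreviated):

```c++
template<typename TAcc, typename TDevAllocator>
ALPAKA_FN_ACC void allocateAndRelease(TAcc const& acc, TDevAllocator& allocHandle, std::size_t n)
{
    void* p = allocHandle.malloc(acc, n); // n == 0 now short-circuits to nullptr
    // ... use p if non-null ...
    allocHandle.free(acc, p);             // safe even if p == nullptr
}
```
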
diff --git a/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp b/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp
index eb4f3d59..fbfdd2d3 100644
--- a/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp
+++ b/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp
@@ -37,6 +37,7 @@
 #include "XMallocSIMD.hpp"
 
 #include <alpaka/alpaka.hpp>
+#include
 
 #include
 #include
@@ -125,7 +126,8 @@
 
             // init with initial counter
             auto& warp_sizecounter
-                = alpaka::declareSharedVar<std::uint32_t[maxThreadsPerBlock / warpSize], __COUNTER__>(acc);
+                = alpaka::declareSharedVar<std::uint32_t[maxThreadsPerBlock / warpSize<AlpakaAcc>], __COUNTER__>(
+                    acc);
             warp_sizecounter[warpid] = 16;
 
             // second half: make sure that all coalesced allocations can fit
@@ -133,7 +135,7 @@
             bool const coalescible = bytes > 0 && bytes < (pagesize / 32);
 
 #if(MALLOCMC_DEVICE_COMPILE)
-            threadcount = popc(ballot(coalescible));
+            threadcount = alpaka::popcount(alpaka::warp::ballot(acc, coalescible));
 #else
             threadcount = 1; // TODO
 #endif
@@ -153,7 +155,8 @@
         template<typename AlpakaAcc>
         ALPAKA_FN_ACC auto distribute(AlpakaAcc const& acc, void* allocatedMem) -> void*
         {
-            auto& warp_res = alpaka::declareSharedVar<char*[maxThreadsPerBlock / warpSize], __COUNTER__>(acc);
+            auto& warp_res
+                = alpaka::declareSharedVar<char*[maxThreadsPerBlock / warpSize<AlpakaAcc>], __COUNTER__>(acc);
 
             char* myalloc = (char*) allocatedMem;
             if(req_size && can_use_coalescing)
diff --git a/src/include/mallocMC/mallocMC_utils.hpp b/src/include/mallocMC/mallocMC_utils.hpp
index c1c9b24f..ad43eb49 100644
--- a/src/include/mallocMC/mallocMC_utils.hpp
+++ b/src/include/mallocMC/mallocMC_utils.hpp
@@ -5,12 +5,13 @@
   Copyright (C) 2012 Institute for Computer Graphics and Vision,
                      Graz University of Technology
-  Copyright (C) 2014 Institute of Radiation Physics,
+  Copyright (C) 2014-2024 Institute of Radiation Physics,
                      Helmholtz-Zentrum Dresden - Rossendorf
 
   Author(s):  Markus Steinberger - steinberger ( at ) icg.tugraz.at
               Michael Kenzel - kenzel ( at ) icg.tugraz.at
               Carlchristian Eckert - c.eckert ( at ) hzdr.de
+              Julian Lenz - j.lenz ( at ) hzdr.de
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -34,16 +35,15 @@
 
 #pragma once
 
 #include <alpaka/alpaka.hpp>
+#include
+
+#include
 
 #ifdef _MSC_VER
 #    include <intrin.h>
 #endif
 
-#include <atomic>
 #include <cstdint>
-#include
-#include
-#include
 #include <type_traits>
 
 /* HIP-clang is doing something wrong and uses the host path of the code when __HIP_DEVICE_COMPILE__
 
 namespace mallocMC
 {
-    template<int psize>
-    class __PointerEquivalent
-    {
-    public:
-        using type = unsigned int;
-    };
-    template<>
-    class __PointerEquivalent<8>
-    {
-    public:
-        using type = unsigned long long;
-    };
+    template<typename TAcc>
+    constexpr uint32_t warpSize = 1U;
 
-#if defined(__CUDA_ARCH__)
-    constexpr auto warpSize = 32; // TODO
-#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
-// defined:
-// https://github.com/llvm/llvm-project/blob/62ec4ac90738a5f2d209ed28c822223e58aaaeb7/clang/lib/Basic/Targets/AMDGPU.cpp#L400
-// overview wave front size:
-// https://github.com/llvm/llvm-project/blob/efc063b621ea0c4d1e452bcade62f7fc7e1cc937/clang/test/Driver/amdgpu-macros.cl#L70-L115
-// gfx10XX has 32 threads per wavefront else 64
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+    template<typename TDim, typename TIdx>
+    constexpr uint32_t warpSize<alpaka::AccGpuCudaRt<TDim, TIdx>> = 32U;
+#endif
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
 #    if(HIP_VERSION_MAJOR >= 4)
-    constexpr auto warpSize = __AMDGCN_WAVEFRONT_SIZE;
+    template<typename TDim, typename TIdx>
+    constexpr uint32_t warpSize<alpaka::AccGpuHipRt<TDim, TIdx>> = __AMDGCN_WAVEFRONT_SIZE;
 #    else
-    constexpr auto warpSize = 64;
+    template<typename TDim, typename TIdx>
+    constexpr uint32_t warpSize<alpaka::AccGpuHipRt<TDim, TIdx>> = 64;
 #    endif
-#else
-    constexpr auto warpSize = 1;
 #endif
 
-    using PointerEquivalent = mallocMC::__PointerEquivalent<sizeof(char*)>::type;
-
     ALPAKA_FN_ACC inline auto laneid()
     {
 #if defined(__CUDA_ARCH__)
@@ -98,7 +84,7 @@
 #elif defined(__HIP_DEVICE_COMPILE__) && defined(__HIP__)
         return __lane_id();
 #else
-        return 0u;
+        return 0U;
 #endif
     }
 
@@ -109,82 +95,87 @@
      *
      * @return current index of the warp
      */
-    ALPAKA_FN_ACC inline auto warpid()
+    template<typename TAcc>
+    ALPAKA_FN_ACC inline auto warpid(TAcc const& /*acc*/) -> uint32_t
     {
-#if defined(__CUDA_ARCH__)
-        std::uint32_t mywarpid;
+        return 0U;
+    }
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+    template<typename TDim, typename TIdx>
+    // ALPAKA_FN_ACC resolves to `__host__ __device__` if we're not in CUDA_ONLY_MODE. But the assembly instruction is
+    // specific to the device and cannot be compiled on the host. So, we need an explicit `__device__` here.
+    inline __device__ auto warpid(alpaka::AccGpuCudaRt<TDim, TIdx> const& /*acc*/) -> uint32_t
+    {
+        std::uint32_t mywarpid = 0;
         asm("mov.u32 %0, %%warpid;" : "=r"(mywarpid));
         return mywarpid;
-#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
+    }
+#endif
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+    template<typename TDim, typename TIdx>
+    ALPAKA_FN_ACC inline auto warpid(alpaka::AccGpuHipRt<TDim, TIdx> const& /*acc*/) -> uint32_t
+    {
         // get wave id
         // https://github.com/ROCm-Developer-Tools/HIP/blob/f72a669487dd352e45321c4b3038f8fe2365c236/include/hip/hcc_detail/device_functions.h#L974-L1024
         return __builtin_amdgcn_s_getreg(GETREG_IMMED(3, 0, 4));
-#else
-        return 0u;
+    }
 #endif
+
+    template<typename TAcc>
+    ALPAKA_FN_ACC inline auto smid(TAcc const& /*acc*/) -> uint32_t
+    {
+        return 0U;
     }
 
-    ALPAKA_FN_ACC inline auto smid()
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+    template<typename TDim, typename TIdx>
+    ALPAKA_FN_ACC inline auto smid(alpaka::AccGpuCudaRt<TDim, TIdx> const& /*acc*/) -> uint32_t
     {
-#if defined(__CUDA_ARCH__)
-        std::uint32_t mysmid;
+        std::uint32_t mysmid = 0;
         asm("mov.u32 %0, %%smid;" : "=r"(mysmid));
         return mysmid;
-#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
-        return __smid();
-#else
-        return 0u;
+    }
 #endif
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+    template<typename TDim, typename TIdx>
+    ALPAKA_FN_ACC inline auto smid(alpaka::AccGpuHipRt<TDim, TIdx> const& /*acc*/) -> uint32_t
+    {
+        return __smid();
     }
+#endif
 
-    ALPAKA_FN_ACC inline auto lanemask_lt()
+    template<typename TAcc>
+    ALPAKA_FN_ACC inline auto lanemask_lt(TAcc const& /*acc*/)
+    {
+        return 0U;
+    }
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+    template<typename TDim, typename TIdx>
+    ALPAKA_FN_ACC inline auto lanemask_lt(alpaka::AccGpuCudaRt<TDim, TIdx> const& /*acc*/)
     {
-#if defined(__CUDA_ARCH__)
         std::uint32_t lanemask;
         asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask));
         return lanemask;
-#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
-        return __lanemask_lt();
-#else
-        return 0u;
-#endif
     }
-
-    ALPAKA_FN_ACC inline auto ballot(int pred)
-    {
-#if defined(__CUDA_ARCH__)
-        return __ballot_sync(__activemask(), pred);
-#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
-        // return value is 64bit for HIP-clang
-        return __ballot(pred);
-#else
-        return 1u;
 #endif
-    }
 
-    ALPAKA_FN_ACC inline auto activemask()
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+    template<typename TDim, typename TIdx>
+    ALPAKA_FN_ACC inline auto lanemask_lt(alpaka::AccGpuHipRt<TDim, TIdx> const& /*acc*/)
     {
-#if defined(__CUDA_ARCH__)
-        return __activemask();
-#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
-        // return value is 64bit for HIP-clang
-        return ballot(1);
-#else
-        return 1u;
-#endif
+        return __lanemask_lt();
     }
+#endif
 
-    template<class T>
-    ALPAKA_FN_HOST_ACC inline auto divup(T a, T b) -> T
-    {
-        return (a + b - 1) / b;
-    }
 
     /** the maximal number threads per block, valid for sm_2.X - sm_7.5
      *
      * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
      */
-    constexpr uint32_t maxThreadsPerBlock = 1024;
+    constexpr uint32_t maxThreadsPerBlock = 1024U;
 
     /** warp id within a cuda block
      *
@@ -199,96 +190,27 @@
         auto const localId = alpaka::mapIdx<1>(
             alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc),
             alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc))[0];
 
-        return localId / warpSize;
+        return localId / warpSize<AlpakaAcc>;
     }
 
-    template<typename T>
-    ALPAKA_FN_ACC inline auto ffs(T mask) -> std::uint32_t
+    template<typename T, typename U, typename = std::enable_if_t<std::is_integral_v<T> && std::is_integral_v<U>>>
+    ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto ceilingDivision(T const numerator, U const denominator) -> T
     {
-#if defined(__CUDA_ARCH__)
-        return ::__ffs(mask);
-#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
-        // return value is 64bit for HIP-clang
-        return ::__ffsll(static_cast<unsigned long long int>(mask));
-#else
-        if(mask == 0)
-            return 0;
-        auto i = 1u;
-        while((mask & 1) == 0)
-        {
-            mask >>= 1;
-            i++;
-        }
-        return i;
-#endif
-    }
-
-    template<typename T>
-    ALPAKA_FN_ACC inline auto popc(T mask) -> std::uint32_t
-    {
-#if defined(__CUDA_ARCH__)
-        return ::__popc(mask);
-#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
-        // return value is 64bit for HIP-clang
-        return ::__popcll(static_cast<unsigned long long int>(mask));
-#else
-        // cf.
-        // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetKernighan
-        std::uint32_t count = 0;
-        while(mask)
-        {
-            count++;
-            mask &= mask - 1;
-        }
-        return count;
-#endif
+        return (numerator + (denominator - 1)) / denominator;
     }
 
-    // Threadfence implementations will maybe moved later into alpaka
-    template<typename T_Acc, typename T_Sfinae = void>
-    struct ThreadFence
-    {
-        // CPU only implementation
-        static void device()
-        {
-            std::atomic_thread_fence(std::memory_order_seq_cst);
-        }
-
-        static void block()
-        {
-            std::atomic_thread_fence(std::memory_order_seq_cst);
-        }
-    };
-
-    template<typename... T_Types>
-    struct ThreadFence<alpaka::AccGpuUniformCudaHipRt<T_Types...>, void>
-    {
-        static ALPAKA_FN_ACC void device()
-        {
-#if MALLOCMC_DEVICE_COMPILE
-            __threadfence();
-#endif
-        }
-
-        static ALPAKA_FN_ACC void block()
-        {
-#if MALLOCMC_DEVICE_COMPILE
-            __threadfence_block();
-#endif
-        }
-    };
-
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T_Acc>
-    ALPAKA_FN_ACC void threadfenceDevice(T_Acc const& acc)
+    template<typename T_size>
+    ALPAKA_FN_INLINE ALPAKA_FN_ACC auto indexOf(
+        void const* const pointer,
+        void const* const start,
+        T_size const stepSize) -> std::make_signed_t<T_size>
     {
-        ThreadFence<T_Acc>::device();
+        return std::distance(reinterpret_cast<char const*>(start), reinterpret_cast<char const*>(pointer)) / stepSize;
     }
 
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T_Acc>
-    ALPAKA_FN_ACC void threadfenceBlock(T_Acc const& acc)
+    template<typename TAcc, typename T>
+    ALPAKA_FN_INLINE ALPAKA_FN_ACC auto atomicLoad(TAcc const& acc, T& target)
     {
-        ThreadFence<T_Acc>::block();
+        return alpaka::atomicCas(acc, &target, static_cast<T>(0U), static_cast<T>(0U));
     }
 } // namespace mallocMC
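
A condensed usage sketch of the new utility helpers (values and names are illustrative):

```c++
template<typename TAcc>
ALPAKA_FN_ACC void utilsDemo(TAcc const& acc, std::uint32_t& chunksize, void* chunk, void* heapStart)
{
    // ceilingDivision replaces the old divup: ceilingDivision(10U, 4U) == 3U
    auto const warpsPerBlock = ceilingDivision(maxThreadsPerBlock, warpSize<TAcc>);

    // atomicLoad emulates an atomic read through a no-op compare-and-swap:
    // CAS(target, 0, 0) changes nothing but returns the current value.
    auto const size = atomicLoad(acc, chunksize);

    // indexOf converts a pointer difference into an element index.
    auto const idx = indexOf(chunk, heapStart, size);

    static_cast<void>(warpsPerBlock + static_cast<std::uint32_t>(idx));
}
```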