diff --git a/src/include/mallocMC/allocator.hpp b/src/include/mallocMC/allocator.hpp
index 9fe1f41b..74e0f5c6 100644
--- a/src/include/mallocMC/allocator.hpp
+++ b/src/include/mallocMC/allocator.hpp
@@ -32,7 +32,6 @@
 #include "mallocMC_allocator_handle.hpp"
 #include "mallocMC_constraints.hpp"
 #include "mallocMC_traits.hpp"
-#include "mallocMC_utils.hpp"
 
 #include <alpaka/alpaka.hpp>
 
@@ -113,11 +112,11 @@
         using uint32 = std::uint32_t;
 
     public:
-        using CreationPolicy = T_CreationPolicy;
         using DistributionPolicy = T_DistributionPolicy;
         using OOMPolicy = T_OOMPolicy;
         using ReservePoolPolicy = T_ReservePoolPolicy;
         using AlignmentPolicy = T_AlignmentPolicy;
+        using CreationPolicy = T_CreationPolicy::template AlignmentAwarePolicy<AlignmentPolicy>;
         using HeapInfoVector = std::vector<HeapInfo>;
         using DevAllocator = DeviceAllocator<CreationPolicy, DistributionPolicy, OOMPolicy, AlignmentPolicy>;
         using AllocatorHandle = AllocatorHandleImpl<Allocator>;
@@ -135,13 +134,7 @@
          * @param size number of bytes
          */
         template<typename AlpakaDevice, typename AlpakaQueue>
-        ALPAKA_FN_HOST void
-        /* `volatile size_t size` is required to break clang optimizations which
-         * results into runtime errors. Observed in PIConGPU if size is known at
-         * compile time. The volatile workaround has no negative effects on the
-         * register usage in CUDA.
-         */
-        alloc(AlpakaDevice& dev, AlpakaQueue& queue, size_t volatile size)
+        ALPAKA_FN_HOST void alloc(AlpakaDevice& dev, AlpakaQueue& queue, size_t size)
         {
             void* pool = reservePolicy.setMemPool(dev, size);
             std::tie(pool, size) = AlignmentPolicy::alignPool(pool, size);
diff --git a/src/include/mallocMC/creationPolicies/OldMalloc.hpp b/src/include/mallocMC/creationPolicies/OldMalloc.hpp
index c75534a1..13ee173d 100644
--- a/src/include/mallocMC/creationPolicies/OldMalloc.hpp
+++ b/src/include/mallocMC/creationPolicies/OldMalloc.hpp
@@ -49,9 +49,12 @@
         using uint32 = std::uint32_t;
 
     public:
+        template<typename T_AlignmentPolicy>
+        using AlignmentAwarePolicy = OldMalloc;
+
         static constexpr auto providesAvailableSlots = false;
 
-        template<typename T_AlignmentPolicy, typename AlpakaAcc>
+        template<typename AlpakaAcc>
         ALPAKA_FN_ACC auto create(AlpakaAcc const& acc, uint32 bytes) const -> void*
         {
             return ::malloc(static_cast<size_t>(bytes));
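
The pattern introduced above deserves a quick illustration: the front ends no longer use a creation policy directly but first resolve it through its nested `AlignmentAwarePolicy` alias template, which lets a policy capture the alignment policy at class scope instead of receiving it on every `create` call. A minimal sketch of both variants (all names besides `AlignmentAwarePolicy` are illustrative, not part of this diff):

```c++
#include <cstdint>

// A policy that ignores alignment aliases itself, exactly like OldMalloc above:
struct SimplePolicy
{
    template<typename T_AlignmentPolicy>
    using AlignmentAwarePolicy = SimplePolicy;
};

// A policy that needs the alignment policy re-instantiates itself with it,
// mirroring the ScatterImpl approach further down in this diff:
template<typename T_AlignmentPolicy = void>
struct AlignedPolicy
{
    template<typename T>
    using AlignmentAwarePolicy = AlignedPolicy<T>;

    // T_AlignmentPolicy is now usable in every member without an extra
    // per-function template parameter:
    static std::uint32_t padded(std::uint32_t bytes)
    {
        return T_AlignmentPolicy::applyPadding(bytes);
    }
};
```
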
diff --git a/src/include/mallocMC/creationPolicies/Scatter.hpp b/src/include/mallocMC/creationPolicies/Scatter.hpp
index 0de95765..38117491 100644
--- a/src/include/mallocMC/creationPolicies/Scatter.hpp
+++ b/src/include/mallocMC/creationPolicies/Scatter.hpp
@@ -34,11 +34,11 @@
 #pragma once
 
 #include "../mallocMC_utils.hpp"
-#include "Scatter.hpp"
 
 #include <alpaka/alpaka.hpp>
+#include
+#include
 
-#include
 #include
 #include
 #include <cstdint> /* uint32_t */
@@ -47,6 +47,7 @@
 #include
 #include
 #include
+#include
 
@@ -106,10 +107,16 @@
      */
     template<
         class T_Config = ScatterConf::DefaultScatterConfig,
-        class T_Hashing = ScatterConf::DefaultScatterHashingParams>
-    class Scatter
+        class T_Hashing = ScatterConf::DefaultScatterHashingParams,
+        class T_AlignmentPolicy = void>
+    class ScatterImpl
     {
     public:
+        // TODO(lenz): This is a bit of a round trip due to a change of interface. A larger refactoring should
+        // remove this again.
+        template<typename T>
+        using AlignmentAwarePolicy = ScatterImpl<T_Config, T_Hashing, T>;
+
         using HeapProperties = T_Config;
         using HashingProperties = T_Hashing;
@@ -294,7 +301,8 @@
          * @param spots number of bits that can be used
          * @return next free spot in the bitfield
          */
-        static ALPAKA_FN_ACC inline auto nextspot(uint32 bitfield, uint32 spot, uint32 spots) -> uint32
+        static ALPAKA_FN_ACC inline auto nextspot(auto const& acc, uint32 bitfield, uint32 spot, uint32 spots)
+            -> uint32
         {
             uint32 const low_part = (spot + 1) == sizeof(uint32) * CHAR_BIT ? 0u : (bitfield >> (spot + 1));
             uint32 const high_part = (bitfield << (spots - (spot + 1)));
@@ -302,7 +310,7 @@
             // wrap around the bitfields from the current spot to the left
             bitfield = (high_part | low_part) & selection_mask;
             // compute the step from the current spot in the bitfield
-            uint32 const step = ffs(~bitfield);
+            uint32 const step = alpaka::ffs(acc, static_cast<std::make_signed_t<uint32>>(~bitfield));
             // and return the new spot
             return (spot + step) % spots;
         }
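
The `nextspot` helper is the heart of the chunk search. The same wrap-around scan, condensed into a host-side sketch with C++20 `std::countr_zero` standing in for `alpaka::ffs` (the `selection_mask` definition is assumed to select the low `spots` bits, as in the surrounding code):

```c++
#include <bit>     // std::countr_zero (C++20)
#include <climits>
#include <cstdint>

// Rotate the bitfield so the search starts just past `spot`, then find the
// first zero bit. The caller guarantees a free spot exists (see the popcount
// guard in usespot below); otherwise the result is unspecified.
std::uint32_t nextFreeSpot(std::uint32_t bitfield, std::uint32_t spot, std::uint32_t spots)
{
    std::uint32_t const low_part = (spot + 1 == sizeof(std::uint32_t) * CHAR_BIT) ? 0U : (bitfield >> (spot + 1));
    std::uint32_t const high_part = bitfield << (spots - (spot + 1));
    std::uint32_t const selection_mask = spots == 32 ? ~0U : ((1U << spots) - 1U);
    std::uint32_t const rotated = (high_part | low_part) & selection_mask;
    // ffs() is 1-based, countr_zero() is 0-based, hence the +1.
    std::uint32_t const step = static_cast<std::uint32_t>(std::countr_zero(~rotated)) + 1U;
    return (spot + step) % spots;
}
```
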
@@ -344,9 +352,9 @@
                 // note: popc(old) == spots should be sufficient,
                 // but if someone corrupts the memory we end up in an
                 // endless loop in here...
-                if(popc(old) >= spots)
+                if(alpaka::popcount(acc, old) >= static_cast<int>(spots))
                     return -1;
-                spot = nextspot(old, spot, spots);
+                spot = nextspot(acc, old, spot, spots);
             }
         }
@@ -376,10 +384,10 @@
             if(fullsegments != 32)
                 return alpaka::math::min(
                     acc,
-                    31,
+                    31U,
                     alpaka::math::max(
                         acc,
-                        0,
+                        0U,
                         (int) pagesize - (int) fullsegments * segmentsize - (int) sizeof(uint32))
                         / chunksize);
             else
@@ -410,8 +418,8 @@
             uint32 spot = randInit() % segments;
             uint32 const mask = _ptes[page].bitmask;
             if((mask & (1u << spot)) != 0)
-                spot = nextspot(mask, spot, segments);
-            uint32 const tries = segments - popc(mask);
+                spot = nextspot(acc, mask, spot, segments);
+            uint32 const tries = segments - alpaka::popcount(acc, mask);
             uint32* onpagemasks = onPageMasksPosition(page, segments);
             for(uint32 i = 0; i < tries; ++i)
             {
@@ -419,7 +427,7 @@
                 if(hspot != -1)
                     return _page[page].data + (32 * spot + hspot) * chunksize;
                 alpaka::atomicOp<alpaka::AtomicOr>(acc, (uint32*) &_ptes[page].bitmask, 1u << spot);
-                spot = nextspot(mask, spot, segments);
+                spot = nextspot(acc, mask, spot, segments);
             }
             return 0;
         }
@@ -542,17 +550,17 @@
          * @return pointer to a free chunk on a page, 0 if we were unable to
          * obtain a free chunk
          */
-        template<typename AlignmentPolicy, typename AlpakaAcc>
+        template<typename AlpakaAcc>
        ALPAKA_FN_ACC auto allocChunked(AlpakaAcc const& acc, uint32 bytes) -> void*
         {
             // use the minimal allocation size to increase the hit rate for small allocations.
-            uint32 const paddedMinChunkSize = AlignmentPolicy::applyPadding(minChunkSize);
+            uint32 const paddedMinChunkSize = T_AlignmentPolicy::applyPadding(minChunkSize);
             uint32 const minAllocation = alpaka::math::max(acc, bytes, paddedMinChunkSize);
             uint32 const numpages = _numpages;
             uint32 const pagesperblock = numpages / _accessblocks;
-            uint32 const reloff = warpSize * minAllocation / pagesize;
-            uint32 const start_page_in_block = (minAllocation * hashingK + hashingDistMP * smid()
-                                                + (hashingDistWP + hashingDistWPRel * reloff) * warpid())
+            uint32 const reloff = warpSize<AlpakaAcc> * minAllocation / pagesize;
+            uint32 const start_page_in_block = (minAllocation * hashingK + hashingDistMP * smid(acc)
+                                                + (hashingDistWP + hashingDistWPRel * reloff) * warpid(acc))
                 % pagesperblock;
             uint32 const maxchunksize = alpaka::math::min(
                 acc,
@@ -687,7 +695,7 @@
             /** Take care that the meta data changes where we did not use atomics are propagated to all
              * other threads.
              */
-            threadfenceDevice(acc);
+            alpaka::mem_fence(acc, alpaka::memory_scope::Device{});
             /* Remove chunk information.
              * It is important that this call happened after page init is called because scatter malloc
              * is updating the chunksize without notify the action by increasing the page count
@@ -742,8 +750,9 @@
             // mark it as free
             uint32 const nMasks = fullsegments + (additional_chunks > 0 ? 1 : 0);
             uint32* onpagemasks = onPageMasksPosition(page, nMasks);
-            uint32 old
-                = alpaka::atomicOp<alpaka::AtomicAnd>(acc, &onpagemasks[segment], ~(1u << withinsegment));
+            /* currently unchecked:
+             * uint32 old = */
+            alpaka::atomicOp<alpaka::AtomicAnd>(acc, &onpagemasks[segment], ~(1u << withinsegment));
 
             // always do this, since it might fail due to a
             // race-condition with addChunkHierarchy
@@ -768,7 +777,7 @@
                 alpaka::atomicOp<alpaka::AtomicExch>(acc, (uint32*) (_regions + region), 0u);
                 uint32 const pagesperblock = _numpages / _accessblocks;
                 uint32 const block = page / pagesperblock;
-                if(warpid() + laneid() == 0)
+                if(warpid(acc) + laneid() == 0)
                     alpaka::atomicOp<alpaka::AtomicMin>(acc, (uint32*) &_firstfreeblock, block);
             }
         }
@@ -817,7 +826,7 @@
             uint32 endpage,
             uint32 bytes) -> void*
         {
-            uint32 const pagestoalloc = divup(bytes, pagesize);
+            uint32 const pagestoalloc = ceilingDivision(bytes, pagesize);
             uint32 freecount = 0;
             bool left_free = false;
             for(uint32 search_page = startpage + 1; search_page > endpage;)
@@ -893,11 +902,11 @@
             // only one thread per warp can acquire the mutex
             void* res = 0;
             // based on the alpaka backend the lanemask type can be 64bit
-            auto const mask = activemask();
-            uint32_t const num = popc(mask);
+            auto const mask = alpaka::warp::activemask(acc);
+            uint32_t const num = alpaka::popcount(acc, mask);
             // based on the alpaka backend the lanemask type can be 64bit
-            auto const lanemask = lanemask_lt();
-            uint32_t const local_id = popc(lanemask & mask);
+            auto const lanemask = lanemask_lt(acc);
+            uint32_t const local_id = alpaka::popcount(acc, lanemask & mask);
             for(unsigned int active = 0; active < num; ++active)
                 if(active == local_id)
                     res = allocPageBasedSingle(acc, bytes);
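
The last hunk above implements warp-level serialization so that only one lane at a time runs the non-reentrant page allocation. Condensed into a standalone sketch (assuming a device context and the `lanemask_lt` helper from mallocMC_utils.hpp; `work` is a hypothetical callable):

```c++
template<typename TAcc, typename TWork>
ALPAKA_FN_ACC auto oneLaneAtATime(TAcc const& acc, TWork work) -> void*
{
    auto const mask = alpaka::warp::activemask(acc);        // lanes currently active
    std::uint32_t const num = alpaka::popcount(acc, mask);  // how many of them
    // rank of this lane among the active lanes:
    std::uint32_t const localId = alpaka::popcount(acc, lanemask_lt(acc) & mask);
    void* result = nullptr;
    for(std::uint32_t active = 0; active < num; ++active)
        if(active == localId) // exactly one active lane matches per iteration
            result = work(acc);
    return result;
}
```
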
@@ -913,11 +922,11 @@
         template<typename AlpakaAcc>
         ALPAKA_FN_ACC void deallocPageBased(AlpakaAcc const& acc, void* mem, uint32 page, uint32 bytes)
         {
-            uint32 const pages = divup(bytes, pagesize);
+            uint32 const pages = ceilingDivision(bytes, pagesize);
             for(uint32 p = page; p < page + pages; ++p)
                 _page[p].init();
 
-            threadfenceDevice(acc);
+            alpaka::mem_fence(acc, alpaka::memory_scope::Device{});
 
             for(uint32 p = page; p < page + pages; ++p)
                 alpaka::atomicOp<alpaka::AtomicCas>(acc, (uint32*) &_ptes[p].chunksize, bytes, 0u);
@@ -931,7 +940,7 @@
          * @param bytes number of bytes to allocate
          * @return pointer to the allocated memory
          */
-        template<typename AlignmentPolicy, typename AlpakaAcc>
+        template<typename AlpakaAcc>
         ALPAKA_FN_ACC auto create(AlpakaAcc const& acc, uint32 bytes) -> void*
         {
             if(bytes == 0)
@@ -944,7 +953,7 @@
              */
             if(bytes <= pagesize)
                 // chunck based
-                return allocChunked<AlignmentPolicy>(acc, bytes);
+                return allocChunked(acc, bytes);
             else
                 // allocate a range of pages
                 return allocPageBased(acc, bytes);
@@ -1008,23 +1017,6 @@
             uint32 numpages = numregions * regionsize;
             // pointer is copied (copy is called page)
             Page* page = (Page*) memory;
-            // sec check for alignment
-            // copy is checked
-            // PointerEquivalent alignmentstatus = ((PointerEquivalent)page)
-            // & (16 -1); if(alignmentstatus != 0)
-            //{
-            //    if(linid == 0){
-            //        printf("c Before:\n");
-            //        printf("c dataAlignment: %d\n",16);
-            //        printf("c Alignmentstatus: %d\n",alignmentstatus);
-            //        printf("c size_t memsize %llu byte\n", memsize);
-            //        printf("c void *memory %p\n", page);
-            //    }
-            //    //copy is adjusted, potentially pointer to higher address
-            //    now. page =(Page*)(((PointerEquivalent)page) + 16 -
-            //    alignmentstatus); if(linid == 0) printf("c Heap Warning:
-            //    memory to use not 16 byte aligned...\n");
-            //}
             // We have to calculate these values here, before using them for other things.
             // First calculate how many blocks of the given size fit our memory pages in principle.
@@ -1056,16 +1048,6 @@
             ptes = (PTE*) (page + numpages);
             regions = (uint32*) (ptes + numpages);
 
-            // if(linid == 0) printf("Heap info: wasting %d
-            // bytes\n",(((POINTEREQUIVALENT)memory) + memsize) -
-            // (POINTEREQUIVALENT)(regions + numregions));
-
-            // if(linid == 0 && alignmentstatus != 0){
-            //    printf("c Was shrinked automatically to:\n");
-            //    printf("c size_t memsize %llu byte\n", memsize);
-            //    printf("c void *memory %p\n", page);
-            //}
-
             for(uint32 i = linid; i < numpages; i += totalThreads)
             {
                 ptes[i].init();
@@ -1116,11 +1098,7 @@
                                  AlpakaAcc const& m_acc,
                                  T_DeviceAllocator* m_heap,
                                  void* m_heapmem,
-                                 size_t m_memsize)
-            {
-                m_heap->pool = m_heapmem;
-                m_heap->initDeviceFunction(m_acc, m_heapmem, m_memsize);
-            };
+                                 size_t m_memsize) { m_heap->initDeviceFunction(m_acc, m_heapmem, m_memsize); };
             using Dim = typename alpaka::trait::DimType<AlpakaAcc>::type;
             using Idx = typename alpaka::trait::IdxType<AlpakaAcc>::type;
             using VecType = alpaka::Vec<Dim, Idx>;
@@ -1199,7 +1177,7 @@
          * @param stride the stride should be equal to the number of
          * different gids (and therefore of value max(gid)-1)
          */
-        template<typename AlignmentPolicy, typename AlpakaAcc>
+        template<typename AlpakaAcc>
         ALPAKA_FN_ACC auto getAvailaibleSlotsDeviceFunction(
             AlpakaAcc const& acc,
             size_t slotSize,
@@ -1223,7 +1201,7 @@
                         chunksize = alpaka::math::max(
                             acc,
                             (uint32) slotSize,
-                            AlignmentPolicy::applyPadding(minChunkSize)); // ensure minimum chunk size
+                            T_AlignmentPolicy::applyPadding(minChunkSize)); // ensure minimum chunk size
                         slotcount += countFreeChunksInPage(
                             acc,
                             currentpage,
@@ -1240,7 +1218,7 @@
             {
                 // 1 slot needs multiple pages
                 if(gid > 0)
                     return 0; // do this serially
-                uint32 const pagestoalloc = divup((uint32) slotSize, pagesize);
+                uint32 const pagestoalloc = ceilingDivision((uint32) slotSize, pagesize);
                 uint32 freecount = 0;
                 for(uint32 currentpage = _numpages; currentpage > 0;)
                 { // this already includes all superblocks
@@ -1296,8 +1274,8 @@
                 auto const gid = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc).sum();
 
                 auto const nWorker = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc).prod();
-                unsigned const temp = heapPtr->template getAvailaibleSlotsDeviceFunction<
-                    typename T_DeviceAllocator::AlignmentPolicy>(acc, numBytes, gid, nWorker);
+                unsigned const temp
+                    = heapPtr->template getAvailaibleSlotsDeviceFunction(acc, numBytes, gid, nWorker);
                 if(temp)
                     alpaka::atomicOp<alpaka::AtomicAdd>(acc, slots, temp);
             };
@@ -1354,21 +1332,22 @@
          *
          * @param slotSize the size of allocatable elements to count
          */
-        template<typename AlignmentPolicy, typename AlpakaAcc>
+        template<typename AlpakaAcc>
         ALPAKA_FN_ACC auto getAvailableSlotsAccelerator(AlpakaAcc const& acc, size_t slotSize) -> unsigned
         {
             int const wId = warpid_withinblock(acc); // do not use warpid-function, since
                                                      // this value is not guaranteed to
                                                      // be stable across warp lifetime
-            uint32 const activeThreads = popc(activemask());
+            uint32 const activeThreads = alpaka::popcount(acc, alpaka::warp::activemask(acc));
+
+            constexpr auto warpsize = warpSize<AlpakaAcc>;
             auto& activePerWarp = alpaka::declareSharedVar<
-                std::uint32_t[maxThreadsPerBlock / warpSize],
+                std::uint32_t[maxThreadsPerBlock / warpsize],
                 __COUNTER__>(acc); // maximum number of warps in a block
             auto& warpResults
-                = alpaka::declareSharedVar<unsigned[maxThreadsPerBlock / warpSize], __COUNTER__>(acc);
+                = alpaka::declareSharedVar<unsigned[maxThreadsPerBlock / warpsize], __COUNTER__>(acc);
 
             warpResults[wId] = 0;
             activePerWarp[wId] = 0;
@@ -1383,16 +1362,13 @@
             // printf("Block %d, id %d: activeThreads=%d
             // linearId=%d\n",blockIdx.x,threadIdx.x,activeThreads,linearId);
 
-            unsigned const temp = this->template getAvailaibleSlotsDeviceFunction<AlignmentPolicy>(
-                acc,
-                slotSize,
-                linearId,
-                activeThreads);
+            unsigned const temp
+                = this->template getAvailaibleSlotsDeviceFunction(acc, slotSize, linearId, activeThreads);
             if(temp)
                 alpaka::atomicOp<alpaka::AtomicAdd>(acc, &warpResults[wId], temp);
 
             alpaka::syncBlockThreads(acc);
-            threadfenceBlock(acc);
+            alpaka::mem_fence(acc, alpaka::memory_scope::Block{});
 
             return warpResults[wId];
         }
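
`getAvailableSlotsAccelerator` combines several of the new primitives. The underlying shared-memory idiom, reduced to its skeleton (using `warpSize`, `maxThreadsPerBlock`, and `warpid_withinblock` as defined in mallocMC_utils.hpp later in this diff):

```c++
template<typename TAcc>
ALPAKA_FN_ACC auto sumPerWarp(TAcc const& acc, std::uint32_t value) -> std::uint32_t
{
    constexpr auto warpsize = warpSize<TAcc>;
    // one counter per warp of the block, allocated in shared memory
    auto& counters
        = alpaka::declareSharedVar<std::uint32_t[maxThreadsPerBlock / warpsize], __COUNTER__>(acc);
    auto const wId = warpid_withinblock(acc);
    counters[wId] = 0U;
    alpaka::syncBlockThreads(acc);
    alpaka::atomicOp<alpaka::AtomicAdd>(acc, &counters[wId], value);
    alpaka::syncBlockThreads(acc);
    return counters[wId]; // every thread sees its warp's total
}
```
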
@@ -1416,5 +1392,12 @@
         }
     };
 
+    template<class T_Config = ScatterConf::DefaultScatterConfig, class T_Hashing = ScatterConf::DefaultScatterHashingParams>
+    struct Scatter
+    {
+        template<typename T_AlignmentPolicy>
+        using AlignmentAwarePolicy = ScatterImpl<T_Config, T_Hashing, T_AlignmentPolicy>;
+    };
+
 } // namespace CreationPolicies
 } // namespace mallocMC
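
For context, this is where the new `Scatter` front end slots into a full allocator definition. The exact policy spellings below follow mallocMC's usual examples and should be treated as illustrative:

```c++
using Dim = alpaka::DimInt<1>;
using Idx = std::size_t;
using Acc = alpaka::AccGpuCudaRt<Dim, Idx>; // any alpaka accelerator works

using ScatterAllocator = mallocMC::Allocator<
    Acc,
    mallocMC::CreationPolicies::Scatter<>,         // the wrapper struct added above
    mallocMC::DistributionPolicies::XMallocSIMD<>,
    mallocMC::OOMPolicies::ReturnNull,
    mallocMC::ReservePoolPolicies::AlpakaBuf<Acc>,
    mallocMC::AlignmentPolicies::Shrink<>>;

// Allocator now resolves its creation policy as
// Scatter<>::AlignmentAwarePolicy<Shrink<>>, i.e. a fully specialized
// ScatterImpl, so no per-call alignment template parameter remains.
```
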
diff --git a/src/include/mallocMC/device_allocator.hpp b/src/include/mallocMC/device_allocator.hpp
index 52e4e736..0f6fe090 100644
--- a/src/include/mallocMC/device_allocator.hpp
+++ b/src/include/mallocMC/device_allocator.hpp
@@ -2,10 +2,11 @@
   mallocMC: Memory Allocator for Many Core Architectures.
   https://www.hzdr.de/crp
 
-  Copyright 2014 - 2015 Institute of Radiation Physics,
+  Copyright 2014 - 2024 Institute of Radiation Physics,
                  Helmholtz-Zentrum Dresden - Rossendorf
 
   Author(s):  Carlchristian Eckert - c.eckert ( at ) hzdr.de
+              Julian J. Lenz - j.lenz ( at ) hzdr.de
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -28,9 +29,7 @@
 
 #pragma once
 
-#include "mallocMC_constraints.hpp"
 #include "mallocMC_traits.hpp"
-#include "mallocMC_utils.hpp"
 
 #include <alpaka/alpaka.hpp>
@@ -58,7 +57,7 @@
         typename T_DistributionPolicy,
         typename T_OOMPolicy,
         typename T_AlignmentPolicy>
-    class DeviceAllocator : public T_CreationPolicy
+    class DeviceAllocator : public T_CreationPolicy::template AlignmentAwarePolicy<T_AlignmentPolicy>
     {
         using uint32 = std::uint32_t;
 
@@ -68,24 +67,31 @@
         using OOMPolicy = T_OOMPolicy;
         using AlignmentPolicy = T_AlignmentPolicy;
 
-        void* pool;
-
         template<typename AlpakaAcc>
         ALPAKA_FN_ACC auto malloc(AlpakaAcc const& acc, size_t bytes) -> void*
         {
+            if(bytes == 0U)
+            {
+                return nullptr;
+            }
             bytes = AlignmentPolicy::applyPadding(bytes);
             DistributionPolicy distributionPolicy(acc);
             uint32 const req_size = distributionPolicy.collect(acc, bytes);
-            void* memBlock = CreationPolicy::template create<AlignmentPolicy>(acc, req_size);
+            void* memBlock = CreationPolicy::template AlignmentAwarePolicy<AlignmentPolicy>::create(acc, req_size);
             if(CreationPolicy::isOOM(memBlock, req_size))
+            {
                 memBlock = OOMPolicy::handleOOM(memBlock);
+            }
             return distributionPolicy.distribute(acc, memBlock);
         }
 
         template<typename AlpakaAcc>
-        ALPAKA_FN_ACC void free(AlpakaAcc const& acc, void* p)
+        ALPAKA_FN_ACC void free(AlpakaAcc const& acc, void* pointer)
         {
-            CreationPolicy::destroy(acc, p);
+            if(pointer != nullptr)
+            {
+                CreationPolicy::template AlignmentAwarePolicy<AlignmentPolicy>::destroy(acc, pointer);
+            }
         }
 
         /** Provide the number of available free slots.
@@ -101,9 +107,15 @@
         {
             slotSize = AlignmentPolicy::applyPadding(slotSize);
             if constexpr(Traits<DeviceAllocator>::providesAvailableSlots)
-                return CreationPolicy::template getAvailableSlotsAccelerator<AlignmentPolicy>(acc, slotSize);
+            {
+                return CreationPolicy::template AlignmentAwarePolicy<AlignmentPolicy>::getAvailableSlotsAccelerator(
+                    acc,
+                    slotSize);
+            }
             else
-                return 0u;
+            {
+                return 0U;
+            }
         }
     };
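
Two behavioral guarantees fall out of the device_allocator.hpp changes and can be relied upon in kernels: zero-byte requests return `nullptr`, and freeing `nullptr` is a no-op. A device-side sketch (allocator handle type abbreviated):

```c++
template<typename TAcc, typename TDevAllocator>
ALPAKA_FN_ACC void allocateAndRelease(TAcc const& acc, TDevAllocator& allocHandle, std::size_t n)
{
    void* p = allocHandle.malloc(acc, n); // n == 0 now short-circuits to nullptr
    // ... use p if non-null ...
    allocHandle.free(acc, p);             // safe even if p == nullptr
}
```
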
diff --git a/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp b/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp
index eb4f3d59..fbfdd2d3 100644
--- a/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp
+++ b/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp
@@ -37,6 +37,7 @@
 #include "XMallocSIMD.hpp"
 
 #include <alpaka/alpaka.hpp>
+#include
 
 #include
 #include
@@ -125,7 +126,8 @@
 
             // init with initial counter
             auto& warp_sizecounter
-                = alpaka::declareSharedVar<std::uint32_t[maxThreadsPerBlock / warpSize], __COUNTER__>(acc);
+                = alpaka::declareSharedVar<std::uint32_t[maxThreadsPerBlock / warpSize<AlpakaAcc>], __COUNTER__>(
+                    acc);
             warp_sizecounter[warpid] = 16;
 
             // second half: make sure that all coalesced allocations can fit
@@ -133,7 +135,7 @@
             bool const coalescible = bytes > 0 && bytes < (pagesize / 32);
 
 #if(MALLOCMC_DEVICE_COMPILE)
-            threadcount = popc(ballot(coalescible));
+            threadcount = alpaka::popcount(alpaka::warp::ballot(acc, coalescible));
 #else
             threadcount = 1; // TODO
 #endif
@@ -153,7 +155,8 @@
         template<typename AlpakaAcc>
         ALPAKA_FN_ACC auto distribute(AlpakaAcc const& acc, void* allocatedMem) -> void*
         {
-            auto& warp_res = alpaka::declareSharedVar<char*[maxThreadsPerBlock / warpSize], __COUNTER__>(acc);
+            auto& warp_res
+                = alpaka::declareSharedVar<char*[maxThreadsPerBlock / warpSize<AlpakaAcc>], __COUNTER__>(acc);
 
             char* myalloc = (char*) allocatedMem;
             if(req_size && can_use_coalescing)
diff --git a/src/include/mallocMC/mallocMC_utils.hpp b/src/include/mallocMC/mallocMC_utils.hpp
index c1c9b24f..ad43eb49 100644
--- a/src/include/mallocMC/mallocMC_utils.hpp
+++ b/src/include/mallocMC/mallocMC_utils.hpp
@@ -5,12 +5,13 @@
   Copyright (C) 2012 Institute for Computer Graphics and Vision,
                      Graz University of Technology
-  Copyright (C) 2014 Institute of Radiation Physics,
+  Copyright (C) 2014-2024 Institute of Radiation Physics,
                      Helmholtz-Zentrum Dresden - Rossendorf
 
   Author(s):  Markus Steinberger - steinberger ( at ) icg.tugraz.at
               Michael Kenzel - kenzel ( at ) icg.tugraz.at
               Carlchristian Eckert - c.eckert ( at ) hzdr.de
+              Julian Lenz - j.lenz ( at ) hzdr.de
 
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
@@ -34,16 +35,15 @@
 
 #pragma once
 
 #include <alpaka/alpaka.hpp>
+#include
+
+#include
 
 #ifdef _MSC_VER
 #    include <intrin.h>
 #endif
 
-#include <atomic>
 #include <cstdint>
-#include
-#include
-#include
 #include <type_traits>
 
 /* HIP-clang is doing something wrong and uses the host path of the code when __HIP_DEVICE_COMPILE__
 
 namespace mallocMC
 {
-    template<int psize>
-    class __PointerEquivalent
-    {
-    public:
-        using type = unsigned int;
-    };
-    template<>
-    class __PointerEquivalent<8>
-    {
-    public:
-        using type = unsigned long long;
-    };
+    template<typename TAcc>
+    constexpr uint32_t warpSize = 1U;
 
-#if defined(__CUDA_ARCH__)
-    constexpr auto warpSize = 32; // TODO
-#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
-// defined:
-// https://github.com/llvm/llvm-project/blob/62ec4ac90738a5f2d209ed28c822223e58aaaeb7/clang/lib/Basic/Targets/AMDGPU.cpp#L400
-// overview wave front size:
-// https://github.com/llvm/llvm-project/blob/efc063b621ea0c4d1e452bcade62f7fc7e1cc937/clang/test/Driver/amdgpu-macros.cl#L70-L115
-// gfx10XX has 32 threads per wavefront else 64
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+    template<typename TDim, typename TIdx>
+    constexpr uint32_t warpSize<alpaka::AccGpuCudaRt<TDim, TIdx>> = 32U;
+#endif
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
 #    if(HIP_VERSION_MAJOR >= 4)
-    constexpr auto warpSize = __AMDGCN_WAVEFRONT_SIZE;
+    template<typename TDim, typename TIdx>
+    constexpr uint32_t warpSize<alpaka::AccGpuHipRt<TDim, TIdx>> = __AMDGCN_WAVEFRONT_SIZE;
 #    else
-    constexpr auto warpSize = 64;
+    template<typename TDim, typename TIdx>
+    constexpr uint32_t warpSize<alpaka::AccGpuHipRt<TDim, TIdx>> = 64;
 #    endif
-#else
-    constexpr auto warpSize = 1;
 #endif
 
-    using PointerEquivalent = mallocMC::__PointerEquivalent<sizeof(char*)>::type;
-
     ALPAKA_FN_ACC inline auto laneid()
     {
 #if defined(__CUDA_ARCH__)
@@ -98,7 +84,7 @@
 #elif defined(__HIP_DEVICE_COMPILE__) && defined(__HIP__)
         return __lane_id();
 #else
-        return 0u;
+        return 0U;
 #endif
     }
 
@@ -109,82 +95,87 @@
      *
      * @return current index of the warp
      */
-    ALPAKA_FN_ACC inline auto warpid()
+    template<typename TAcc>
+    ALPAKA_FN_ACC inline auto warpid(TAcc const& /*acc*/) -> uint32_t
     {
-#if defined(__CUDA_ARCH__)
-        std::uint32_t mywarpid;
+        return 0U;
+    }
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+    template<typename TDim, typename TIdx>
+    // ALPAKA_FN_ACC resolves to `__host__ __device__` if we're not in CUDA_ONLY_MODE. But the assembly instruction is
+    // specific to the device and cannot be compiled on the host. So, we need an explicit `__device__` here.
+    inline __device__ auto warpid(alpaka::AccGpuCudaRt<TDim, TIdx> const& /*acc*/) -> uint32_t
+    {
+        std::uint32_t mywarpid = 0;
         asm("mov.u32 %0, %%warpid;" : "=r"(mywarpid));
         return mywarpid;
-#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
+    }
+#endif
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+    template<typename TDim, typename TIdx>
+    ALPAKA_FN_ACC inline auto warpid(alpaka::AccGpuHipRt<TDim, TIdx> const& /*acc*/) -> uint32_t
+    {
         // get wave id
         // https://github.com/ROCm-Developer-Tools/HIP/blob/f72a669487dd352e45321c4b3038f8fe2365c236/include/hip/hcc_detail/device_functions.h#L974-L1024
         return __builtin_amdgcn_s_getreg(GETREG_IMMED(3, 0, 4));
-#else
-        return 0u;
+    }
 #endif
+
+    template<typename TAcc>
+    ALPAKA_FN_ACC inline auto smid(TAcc const& /*acc*/) -> uint32_t
+    {
+        return 0U;
     }
 
-    ALPAKA_FN_ACC inline auto smid()
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+    template<typename TDim, typename TIdx>
+    ALPAKA_FN_ACC inline auto smid(alpaka::AccGpuCudaRt<TDim, TIdx> const& /*acc*/) -> uint32_t
     {
-#if defined(__CUDA_ARCH__)
-        std::uint32_t mysmid;
+        std::uint32_t mysmid = 0;
         asm("mov.u32 %0, %%smid;" : "=r"(mysmid));
         return mysmid;
-#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
-        return __smid();
-#else
-        return 0u;
+    }
 #endif
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+    template<typename TDim, typename TIdx>
+    ALPAKA_FN_ACC inline auto smid(alpaka::AccGpuHipRt<TDim, TIdx> const& /*acc*/) -> uint32_t
+    {
+        return __smid();
     }
+#endif
 
-    ALPAKA_FN_ACC inline auto lanemask_lt()
+    template<typename TAcc>
+    ALPAKA_FN_ACC inline auto lanemask_lt(TAcc const& /*acc*/)
+    {
+        return 0U;
+    }
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+    template<typename TDim, typename TIdx>
+    ALPAKA_FN_ACC inline auto lanemask_lt(alpaka::AccGpuCudaRt<TDim, TIdx> const& /*acc*/)
     {
-#if defined(__CUDA_ARCH__)
         std::uint32_t lanemask;
         asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask));
         return lanemask;
-#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
-        return __lanemask_lt();
-#else
-        return 0u;
-#endif
     }
-
-    ALPAKA_FN_ACC inline auto ballot(int pred)
-    {
-#if defined(__CUDA_ARCH__)
-        return __ballot_sync(__activemask(), pred);
-#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
-        // return value is 64bit for HIP-clang
-        return __ballot(pred);
-#else
-        return 1u;
 #endif
-    }
 
-    ALPAKA_FN_ACC inline auto activemask()
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+    template<typename TDim, typename TIdx>
+    ALPAKA_FN_ACC inline auto lanemask_lt(alpaka::AccGpuHipRt<TDim, TIdx> const& /*acc*/)
     {
-#if defined(__CUDA_ARCH__)
-        return __activemask();
-#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
-        // return value is 64bit for HIP-clang
-        return ballot(1);
-#else
-        return 1u;
-#endif
+        return __lanemask_lt();
     }
+#endif
 
-    template<class T>
-    ALPAKA_FN_HOST_ACC inline auto divup(T a, T b) -> T
-    {
-        return (a + b - 1) / b;
-    }
 
     /** the maximal number threads per block, valid for sm_2.X - sm_7.5
      *
      * https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
      */
-    constexpr uint32_t maxThreadsPerBlock = 1024;
+    constexpr uint32_t maxThreadsPerBlock = 1024U;
 
     /** warp id within a cuda block
      *
@@ -199,96 +190,27 @@
         auto const localId = alpaka::mapIdx<1>(
             alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc),
             alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc))[0];
 
-        return localId / warpSize;
+        return localId / warpSize<AlpakaAcc>;
     }
 
-    template<typename T>
-    ALPAKA_FN_ACC inline auto ffs(T mask) -> std::uint32_t
+    template<typename T, typename U, typename = std::enable_if_t<std::is_integral_v<T> && std::is_integral_v<U>>>
+    ALPAKA_FN_INLINE ALPAKA_FN_ACC constexpr auto ceilingDivision(T const numerator, U const denominator) -> T
     {
-#if defined(__CUDA_ARCH__)
-        return ::__ffs(mask);
-#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
-        // return value is 64bit for HIP-clang
-        return ::__ffsll(static_cast<unsigned long long int>(mask));
-#else
-        if(mask == 0)
-            return 0;
-        auto i = 1u;
-        while((mask & 1) == 0)
-        {
-            mask >>= 1;
-            i++;
-        }
-        return i;
-#endif
-    }
-
-    template<typename T>
-    ALPAKA_FN_ACC inline auto popc(T mask) -> std::uint32_t
-    {
-#if defined(__CUDA_ARCH__)
-        return ::__popc(mask);
-#elif(MALLOCMC_DEVICE_COMPILE && BOOST_COMP_HIP)
-        // return value is 64bit for HIP-clang
-        return ::__popcll(static_cast<unsigned long long int>(mask));
-#else
-        // cf.
-        // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetKernighan
-        std::uint32_t count = 0;
-        while(mask)
-        {
-            count++;
-            mask &= mask - 1;
-        }
-        return count;
-#endif
+        return (numerator + (denominator - 1)) / denominator;
     }
 
-    // Threadfence implementations will maybe moved later into alpaka
-    template<typename T_Acc, typename T_Sfinae = void>
-    struct ThreadFence
-    {
-        // CPU only implementation
-        static void device()
-        {
-            std::atomic_thread_fence(std::memory_order_seq_cst);
-        }
-
-        static void block()
-        {
-            std::atomic_thread_fence(std::memory_order_seq_cst);
-        }
-    };
-
-    template<typename... T_Types>
-    struct ThreadFence<alpaka::AccGpuUniformCudaHipRt<T_Types...>, void>
-    {
-        static ALPAKA_FN_ACC void device()
-        {
-#if MALLOCMC_DEVICE_COMPILE
-            __threadfence();
-#endif
-        }
-
-        static ALPAKA_FN_ACC void block()
-        {
-#if MALLOCMC_DEVICE_COMPILE
-            __threadfence_block();
-#endif
-        }
-    };
-
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T_Acc>
-    ALPAKA_FN_ACC void threadfenceDevice(T_Acc const& acc)
+    template<typename T_size>
+    ALPAKA_FN_INLINE ALPAKA_FN_ACC auto indexOf(
+        void const* const pointer,
+        void const* const start,
+        T_size const stepSize) -> std::make_signed_t<T_size>
     {
-        ThreadFence<T_Acc>::device();
+        return std::distance(reinterpret_cast<char const*>(start), reinterpret_cast<char const*>(pointer)) / stepSize;
     }
 
-    ALPAKA_NO_HOST_ACC_WARNING
-    template<typename T_Acc>
-    ALPAKA_FN_ACC void threadfenceBlock(T_Acc const& acc)
+    template<typename TAcc, typename T>
+    ALPAKA_FN_INLINE ALPAKA_FN_ACC auto atomicLoad(TAcc const& acc, T& target)
     {
-        ThreadFence<T_Acc>::block();
+        return alpaka::atomicCas(acc, &target, static_cast<T>(0U), static_cast<T>(0U));
     }
 } // namespace mallocMC
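
A condensed usage sketch of the new utility helpers (values and names are illustrative):

```c++
template<typename TAcc>
ALPAKA_FN_ACC void utilsDemo(TAcc const& acc, std::uint32_t& chunksize, void* chunk, void* heapStart)
{
    // ceilingDivision replaces the old divup: ceilingDivision(10U, 4U) == 3U
    auto const warpsPerBlock = ceilingDivision(maxThreadsPerBlock, warpSize<TAcc>);

    // atomicLoad emulates an atomic read through a no-op compare-and-swap:
    // CAS(target, 0, 0) changes nothing but returns the current value.
    auto const size = atomicLoad(acc, chunksize);

    // indexOf converts a pointer difference into an element index.
    auto const idx = indexOf(chunk, heapStart, size);

    static_cast<void>(warpsPerBlock + static_cast<std::uint32_t>(idx));
}
```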