From 1fea13bb87f40673f0ad8fbfe18f1bbcb4e54ff4 Mon Sep 17 00:00:00 2001 From: Julian Lenz Date: Fri, 8 Nov 2024 09:49:59 +0100 Subject: [PATCH 1/2] Copy .clang-format from alpaka --- .clang-format | 110 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 83 insertions(+), 27 deletions(-) diff --git a/.clang-format b/.clang-format index 63ec7485..7249ac3a 100644 --- a/.clang-format +++ b/.clang-format @@ -4,15 +4,16 @@ Standard: c++20 DisableFormat: false AccessModifierOffset: -4 AlignAfterOpenBracket: AlwaysBreak +AlignArrayOfStructures: None AlignConsecutiveAssignments: false AlignConsecutiveBitFields: false AlignConsecutiveDeclarations: false AlignConsecutiveMacros: false AlignEscapedNewlines: Right -AlignOperands: DontAlign -AlignTrailingComments: false +AlignOperands: Align +AlignTrailingComments: + Kind: Never AllowAllArgumentsOnNextLine: false -AllowAllConstructorInitializersOnNextLine: false AllowAllParametersOfDeclarationOnNextLine: false AllowShortBlocksOnASingleLine: Never AllowShortCaseLabelsOnASingleLine: false @@ -26,59 +27,90 @@ AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: Yes BinPackArguments: false BinPackParameters: false +BitFieldColonSpacing: Both +BreakAfterAttributes: Never BreakBeforeBinaryOperators: All BreakBeforeBraces: Allman +BreakBeforeConceptDeclarations: Always +BreakBeforeInlineASMColon: OnlyMultiline BreakBeforeTernaryOperators: true BreakConstructorInitializers: BeforeComma BreakInheritanceList: BeforeComma BreakStringLiterals: true ColumnLimit: 119 -CommentPragmas: "^ COMMENT pragma:" +CommentPragmas: '^ COMMENT pragma:' CompactNamespaces: false -ConstructorInitializerAllOnOneLineOrOnePerLine: true ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true -DeriveLineEnding: true DerivePointerAlignment: false +EmptyLineAfterAccessModifier: Never +EmptyLineBeforeAccessModifier: Always ExperimentalAutoDetectBinPacking: false FixNamespaceComments: true IncludeBlocks: Regroup -IncludeIsMainRegex: "(Test)?$" -IncludeIsMainSourceRegex: "" +IncludeIsMainRegex: '(Test)?$' +IncludeIsMainSourceRegex: '' +IndentAccessModifiers: false IndentCaseBlocks: true IndentCaseLabels: false IndentExternBlock: AfterExternBlock IndentGotoLabels: true IndentPPDirectives: AfterHash +IndentRequiresClause: false IndentWidth: 4 IndentWrappedFunctionNames: false +InsertBraces: false +InsertNewlineAtEOF: true +IntegerLiteralSeparator: + Binary: 4 + Decimal: 3 + DecimalMinDigits: 7 + Hex: 4 KeepEmptyLinesAtTheStartOfBlocks: false -MacroBlockBegin: "" -MacroBlockEnd: "" +LambdaBodyIndentation: Signature +LineEnding: DeriveLF +MacroBlockBegin: '' +MacroBlockEnd: '' MaxEmptyLinesToKeep: 2 NamespaceIndentation: All +PackConstructorInitializers: CurrentLine PenaltyBreakAssignment: 2 PenaltyBreakBeforeFirstCallParameter: 19 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 +PenaltyBreakOpenParenthesis: 0 # default made explicit here PenaltyBreakString: 1000 PenaltyBreakTemplateDeclaration: 10 PenaltyExcessCharacter: 1000000 +PenaltyIndentedWhitespace: 0 # default made explicit here PenaltyReturnTypeOnItsOwnLine: 1000 PointerAlignment: Left +PPIndentWidth: -1 # follow IndentWidth +QualifierAlignment: Custom +QualifierOrder: ['friend', 'static', 'inline', 'constexpr', 'type', 'const', 'volatile', 'restrict'] +ReferenceAlignment: Pointer # follow PointerAlignment ReflowComments: true +RemoveBracesLLVM: false +RemoveSemicolon: false +RequiresClausePosition: WithPreceding +RequiresExpressionIndentation: OuterScope +ShortNamespaceLines: 0 SortIncludes: true -SortUsingDeclarations: true +SortUsingDeclarations: Lexicographic +SeparateDefinitionBlocks: Always SpaceAfterCStyleCast: true SpaceAfterLogicalNot: false SpaceAfterTemplateKeyword: false +SpaceAroundPointerQualifiers: Default # follow PointerAlignment SpaceBeforeAssignmentOperators: true +SpaceBeforeCaseColon: false SpaceBeforeCpp11BracedList: false SpaceBeforeCtorInitializerColon: true SpaceBeforeInheritanceColon: true SpaceBeforeParens: Never SpaceBeforeRangeBasedForLoopColon: true +SpaceBeforeSquareBrackets: false SpaceInEmptyBlock: false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 @@ -86,24 +118,48 @@ SpacesInAngles: false SpacesInConditionalStatement: false SpacesInContainerLiterals: false SpacesInCStyleCastParentheses: false +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 SpacesInParentheses: false SpacesInSquareBrackets: false -SpaceBeforeSquareBrackets: false TabWidth: 4 UseCRLF: false UseTab: Never -# Project specific options -- uncomment and modify as needed -#IncludeCategories: -# - Regex: '^"(llvm|llvm-c|clang|clang-c)/' -# Priority: 2 -# SortPriority: 0 -# - Regex: '^(<|"(gtest|gmock|isl|json)/)' -# Priority: 3 -# SortPriority: 0 -# - Regex: '.*' -# Priority: 1 -# SortPriority: 0 - -# Future options - not supported in clang-format 11 -# BitFieldColonSpacing: Both -# OperandAlignmentStyle: Align +# Project specific options +#AttributeMacros: [] +#ForEachMacros: [] +#IfMacros: [] +IncludeCategories: + # Local headers (in "") above all else + - Regex: '"([A-Za-z0-9.\/-_])+"' + Priority: 1 + # "alpaka/foo.hpp" after local headers (occur inside alpaka) + - Regex: '"alpaka/([A-Za-z0-9.\/-_])+"' + Priority: 2 + # after local headers (occur outside alpaka in examples and test) + - Regex: '' + Priority: 3 + # C++ standard library headers are the last group to be included + - Regex: '<([A-Za-z0-9\/-_])+>' + Priority: 5 + # Includes that made it this far are third-party headers and will be placed + # below alpaka's includes + - Regex: '<([A-Za-z0-9.\/-_])+>' + Priority: 4 +# Macros: [] +# NamespaceMacros: [] +StatementAttributeLikeMacros: + - 'ALPAKA_DEVICE_VOLATILE' + - 'ALPAKA_FN_ACC' + - 'ALPAKA_FN_EXTERN' + - 'ALPAKA_FN_HOST' + - 'ALPAKA_FN_HOST_ACC' + - 'ALPAKA_FN_INLINE' + - 'ALPAKA_STATIC_ACC_MEM_CONSTANT' + - 'ALPAKA_STATIC_ACC_MEM_GLOBAL' + - 'ALPAKA_UNROLL' + - 'ALPAKA_VECTORIZE_HINT' +#StatementMacros: [] +#TypenameMacros: [] +#WhitespaceSensitiveMacros: [] From fbefa18015414c70847476570f60b6570e2e377f Mon Sep 17 00:00:00 2001 From: Third Party Date: Fri, 8 Nov 2024 09:51:50 +0100 Subject: [PATCH 2/2] Run clang-format --- examples/mallocMC_example01.cpp | 44 ++-- examples/mallocMC_example03.cpp | 16 +- .../mallocMC/alignmentPolicies/Noop.hpp | 1 + .../mallocMC/alignmentPolicies/Shrink.hpp | 2 + src/include/mallocMC/allocator.hpp | 5 +- .../mallocMC/creationPolicies/OldMalloc.hpp | 5 +- .../mallocMC/creationPolicies/Scatter.hpp | 222 +++++++++--------- src/include/mallocMC/device_allocator.hpp | 9 +- .../mallocMC/distributionPolicies/Noop.hpp | 7 +- .../distributionPolicies/XMallocSIMD.hpp | 9 +- .../mallocMC/mallocMC_allocator_handle.hpp | 6 +- src/include/mallocMC/mallocMC_utils.hpp | 6 +- .../oOMPolicies/BadAllocException.hpp | 1 + .../mallocMC/oOMPolicies/ReturnNull.hpp | 1 + .../reservePoolPolicies/AlpakaBuf.hpp | 3 +- .../reservePoolPolicies/CudaSetLimits.hpp | 3 +- tests/dimensions.cpp | 55 ++--- tests/policies.cpp | 5 +- tests/verify_heap.cpp | 112 ++++----- tests/verify_heap_config.hpp | 1 + 20 files changed, 270 insertions(+), 243 deletions(-) diff --git a/examples/mallocMC_example01.cpp b/examples/mallocMC_example01.cpp index f64c99b0..c991f357 100644 --- a/examples/mallocMC_example01.cpp +++ b/examples/mallocMC_example01.cpp @@ -26,12 +26,14 @@ THE SOFTWARE. */ -#include #include #include + +#include + +#include #include #include -#include #include using Dim = alpaka::DimInt<1>; @@ -84,7 +86,7 @@ auto main() -> int constexpr auto length = 100; auto const platform = alpaka::Platform{}; - const auto dev = alpaka::getDevByIdx(platform, 0); + auto const dev = alpaka::getDevByIdx(platform, 0); auto queue = alpaka::Queue{dev}; auto const devProps = alpaka::getAccDevProps(dev); @@ -103,13 +105,13 @@ auto main() -> int // create arrays of arrays on the device { auto createArrayPointers - = [] ALPAKA_FN_ACC(const Acc& acc, int x, int y, ScatterAllocator::AllocatorHandle allocHandle) + = [] ALPAKA_FN_ACC(Acc const& acc, int x, int y, ScatterAllocator::AllocatorHandle allocHandle) { arA = (int**) allocHandle.malloc(acc, sizeof(int*) * x * y); arB = (int**) allocHandle.malloc(acc, sizeof(int*) * x * y); arC = (int**) allocHandle.malloc(acc, sizeof(int*) * x * y); }; - const auto workDiv = alpaka::WorkDivMembers{Idx{1}, Idx{1}, Idx{1}}; + auto const workDiv = alpaka::WorkDivMembers{Idx{1}, Idx{1}, Idx{1}}; alpaka::enqueue( queue, alpaka::createTaskKernel( @@ -122,9 +124,9 @@ auto main() -> int // fill 2 of them all with ascending values { - auto fillArrays = [] ALPAKA_FN_ACC(const Acc& acc, int length, ScatterAllocator::AllocatorHandle allocHandle) + auto fillArrays = [] ALPAKA_FN_ACC(Acc const& acc, int length, ScatterAllocator::AllocatorHandle allocHandle) { - const auto id = alpaka::getIdx(acc)[0]; + auto const id = alpaka::getIdx(acc)[0]; arA[id] = (int*) allocHandle.malloc(acc, length * sizeof(int)); arB[id] = (int*) allocHandle.malloc(acc, length * sizeof(int)); @@ -136,7 +138,7 @@ auto main() -> int arB[id][i] = static_cast(id * length + i); } }; - const auto workDiv = alpaka::WorkDivMembers{Idx{grid}, Idx{block}, Idx{1}}; + auto const workDiv = alpaka::WorkDivMembers{Idx{grid}, Idx{block}, Idx{1}}; alpaka::enqueue( queue, alpaka::createTaskKernel(workDiv, fillArrays, length, scatterAlloc.getAllocatorHandle())); @@ -147,9 +149,9 @@ auto main() -> int { auto sumsBufferAcc = alpaka::allocBuf(dev, Idx{block * grid}); - auto addArrays = [] ALPAKA_FN_ACC(const Acc& acc, int length, int* sums) + auto addArrays = [] ALPAKA_FN_ACC(Acc const& acc, int length, int* sums) { - const auto id = alpaka::getIdx(acc)[0]; + auto const id = alpaka::getIdx(acc)[0]; sums[id] = 0; for(int i = 0; i < length; ++i) @@ -158,25 +160,25 @@ auto main() -> int sums[id] += arC[id][i]; } }; - const auto workDiv = alpaka::WorkDivMembers{Idx{grid}, Idx{block}, Idx{1}}; + auto const workDiv = alpaka::WorkDivMembers{Idx{grid}, Idx{block}, Idx{1}}; alpaka::enqueue( queue, alpaka::createTaskKernel(workDiv, addArrays, length, alpaka::getPtrNative(sumsBufferAcc))); auto const platformCPU = alpaka::Platform{}; - const auto hostDev = alpaka::getDevByIdx(platformCPU, 0); + auto const hostDev = alpaka::getDevByIdx(platformCPU, 0); auto sumsBufferHost = alpaka::allocBuf(hostDev, Idx{block * grid}); alpaka::memcpy(queue, sumsBufferHost, sumsBufferAcc, Idx{block * grid}); alpaka::wait(queue); - const auto* sumsPtr = alpaka::getPtrNative(sumsBufferHost); - const auto sum = std::accumulate(sumsPtr, sumsPtr + block * grid, size_t{0}); + auto const* sumsPtr = alpaka::getPtrNative(sumsBufferHost); + auto const sum = std::accumulate(sumsPtr, sumsPtr + block * grid, size_t{0}); std::cout << "The sum of the arrays on GPU is " << sum << '\n'; } - const auto n = static_cast(block * grid * length); - const auto gaussian = n * (n - 1); + auto const n = static_cast(block * grid * length); + auto const gaussian = n * (n - 1); std::cout << "The gaussian sum as comparison: " << gaussian << '\n'; /*constexpr*/ if(mallocMC::Traits::providesAvailableSlots) @@ -187,25 +189,25 @@ auto main() -> int } { - auto freeArrays = [] ALPAKA_FN_ACC(const Acc& acc, ScatterAllocator::AllocatorHandle allocHandle) + auto freeArrays = [] ALPAKA_FN_ACC(Acc const& acc, ScatterAllocator::AllocatorHandle allocHandle) { - const auto id = alpaka::getIdx(acc)[0]; + auto const id = alpaka::getIdx(acc)[0]; allocHandle.free(acc, arA[id]); allocHandle.free(acc, arB[id]); allocHandle.free(acc, arC[id]); }; - const auto workDiv = alpaka::WorkDivMembers{Idx{grid}, Idx{block}, Idx{1}}; + auto const workDiv = alpaka::WorkDivMembers{Idx{grid}, Idx{block}, Idx{1}}; alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv, freeArrays, scatterAlloc.getAllocatorHandle())); } { - auto freeArrayPointers = [] ALPAKA_FN_ACC(const Acc& acc, ScatterAllocator::AllocatorHandle allocHandle) + auto freeArrayPointers = [] ALPAKA_FN_ACC(Acc const& acc, ScatterAllocator::AllocatorHandle allocHandle) { allocHandle.free(acc, arA); allocHandle.free(acc, arB); allocHandle.free(acc, arC); }; - const auto workDiv = alpaka::WorkDivMembers{Idx{1}, Idx{1}, Idx{1}}; + auto const workDiv = alpaka::WorkDivMembers{Idx{1}, Idx{1}, Idx{1}}; alpaka::enqueue( queue, alpaka::createTaskKernel(workDiv, freeArrayPointers, scatterAlloc.getAllocatorHandle())); diff --git a/examples/mallocMC_example03.cpp b/examples/mallocMC_example03.cpp index f2262b63..6551e0d7 100644 --- a/examples/mallocMC_example03.cpp +++ b/examples/mallocMC_example03.cpp @@ -26,12 +26,14 @@ THE SOFTWARE. */ -#include #include #include + +#include + +#include #include #include -#include #include #include @@ -75,14 +77,14 @@ ALPAKA_STATIC_ACC_MEM_GLOBAL int* arA = nullptr; struct ExampleKernel { - ALPAKA_FN_ACC void operator()(const Acc& acc, ScatterAllocator::AllocatorHandle allocHandle) const + ALPAKA_FN_ACC void operator()(Acc const& acc, ScatterAllocator::AllocatorHandle allocHandle) const { - const auto id = static_cast(alpaka::getIdx(acc)[0]); + auto const id = static_cast(alpaka::getIdx(acc)[0]); if(id == 0) arA = (int*) allocHandle.malloc(acc, sizeof(int) * 32); // wait the the malloc from thread zero is not changing the result for some threads alpaka::syncBlockThreads(acc); - const auto slots = allocHandle.getAvailableSlots(acc, 1); + auto const slots = allocHandle.getAvailableSlots(acc, 1); if(arA != nullptr) { arA[id] = id; @@ -101,14 +103,14 @@ struct ExampleKernel auto main() -> int { auto const platform = alpaka::Platform{}; - const auto dev = alpaka::getDevByIdx(platform, 0); + auto const dev = alpaka::getDevByIdx(platform, 0); auto queue = alpaka::Queue{dev}; auto const devProps = alpaka::getAccDevProps(dev); unsigned const block = std::min(static_cast(32u), static_cast(devProps.m_blockThreadCountMax)); ScatterAllocator scatterAlloc(dev, queue, 1U * 1024U * 1024U * 1024U); // 1GB for device-side malloc - const auto workDiv = alpaka::WorkDivMembers{Idx{1}, Idx{block}, Idx{1}}; + auto const workDiv = alpaka::WorkDivMembers{Idx{1}, Idx{block}, Idx{1}}; alpaka::enqueue(queue, alpaka::createTaskKernel(workDiv, ExampleKernel{}, scatterAlloc.getAllocatorHandle())); std::cout << "Slots from Host: " << scatterAlloc.getAvailableSlots(dev, queue, 1) << '\n'; diff --git a/src/include/mallocMC/alignmentPolicies/Noop.hpp b/src/include/mallocMC/alignmentPolicies/Noop.hpp index 5b631d3c..ee176187 100644 --- a/src/include/mallocMC/alignmentPolicies/Noop.hpp +++ b/src/include/mallocMC/alignmentPolicies/Noop.hpp @@ -30,6 +30,7 @@ #include "Noop.hpp" #include + #include #include #include diff --git a/src/include/mallocMC/alignmentPolicies/Shrink.hpp b/src/include/mallocMC/alignmentPolicies/Shrink.hpp index 4fd01184..0eb495e9 100644 --- a/src/include/mallocMC/alignmentPolicies/Shrink.hpp +++ b/src/include/mallocMC/alignmentPolicies/Shrink.hpp @@ -34,6 +34,7 @@ #include "Shrink.hpp" #include + #include #include #include @@ -50,6 +51,7 @@ namespace mallocMC { using type = unsigned int; }; + template<> struct __PointerEquivalent<8> { diff --git a/src/include/mallocMC/allocator.hpp b/src/include/mallocMC/allocator.hpp index 51375588..9fe1f41b 100644 --- a/src/include/mallocMC/allocator.hpp +++ b/src/include/mallocMC/allocator.hpp @@ -35,6 +35,7 @@ #include "mallocMC_utils.hpp" #include + #include #include #include @@ -140,7 +141,7 @@ namespace mallocMC * compile time. The volatile workaround has no negative effects on the * register usage in CUDA. */ - alloc(AlpakaDevice& dev, AlpakaQueue& queue, volatile size_t size) + alloc(AlpakaDevice& dev, AlpakaQueue& queue, size_t volatile size) { void* pool = reservePolicy.setMemPool(dev, size); std::tie(pool, size) = AlignmentPolicy::alignPool(pool, size); @@ -173,7 +174,7 @@ namespace mallocMC /* forbid to copy the allocator */ ALPAKA_FN_HOST - Allocator(const Allocator&) = delete; + Allocator(Allocator const&) = delete; public: template diff --git a/src/include/mallocMC/creationPolicies/OldMalloc.hpp b/src/include/mallocMC/creationPolicies/OldMalloc.hpp index c8729d54..c75534a1 100644 --- a/src/include/mallocMC/creationPolicies/OldMalloc.hpp +++ b/src/include/mallocMC/creationPolicies/OldMalloc.hpp @@ -30,6 +30,7 @@ #include "OldMalloc.hpp" #include + #include namespace mallocMC @@ -51,13 +52,13 @@ namespace mallocMC static constexpr auto providesAvailableSlots = false; template - ALPAKA_FN_ACC auto create(const AlpakaAcc& acc, uint32 bytes) const -> void* + ALPAKA_FN_ACC auto create(AlpakaAcc const& acc, uint32 bytes) const -> void* { return ::malloc(static_cast(bytes)); } template - ALPAKA_FN_ACC void destroy(const AlpakaAcc& /*acc*/, void* mem) const + ALPAKA_FN_ACC void destroy(AlpakaAcc const& /*acc*/, void* mem) const { ::free(mem); } diff --git a/src/include/mallocMC/creationPolicies/Scatter.hpp b/src/include/mallocMC/creationPolicies/Scatter.hpp index feeb4d3e..0de95765 100644 --- a/src/include/mallocMC/creationPolicies/Scatter.hpp +++ b/src/include/mallocMC/creationPolicies/Scatter.hpp @@ -36,8 +36,9 @@ #include "../mallocMC_utils.hpp" #include "Scatter.hpp" -#include #include + +#include #include #include #include /* uint32_t */ @@ -111,11 +112,13 @@ namespace mallocMC public: using HeapProperties = T_Config; using HashingProperties = T_Hashing; + struct Properties : HeapProperties , HashingProperties { }; + static constexpr auto providesAvailableSlots = true; private: @@ -164,6 +167,7 @@ namespace mallocMC private: #if _DEBUG || ANALYSEHEAP + public: #endif /* HierarchyThreshold defines the largest chunk size which can be stored in a segment with hierarchy. @@ -252,7 +256,7 @@ namespace mallocMC * volatile avoids that the data is changed within L1 Cache and therefore is hidden for other * threads. */ - volatile uint32* write = (uint32*) (data + pagesize - (int) (sizeof(uint32) * maxOnPageMasks)); + uint32 volatile* write = (uint32*) (data + pagesize - (int) (sizeof(uint32) * maxOnPageMasks)); while(write < (uint32*) (data + pagesize)) *write++ = 0; } @@ -261,14 +265,14 @@ namespace mallocMC // the data used by the allocator volatile PTE* _ptes; - volatile uint32* _regions; + uint32 volatile* _regions; Page* _page; size_t _memsize; uint32 _numpages; uint32 _accessblocks; uint32 _pagebasedMutex; - volatile uint32 _firstFreePageBased; - volatile uint32 _firstfreeblock; + uint32 volatile _firstFreePageBased; + uint32 volatile _firstfreeblock; /** * randInit should create an random offset which can be used @@ -292,13 +296,13 @@ namespace mallocMC */ static ALPAKA_FN_ACC inline auto nextspot(uint32 bitfield, uint32 spot, uint32 spots) -> uint32 { - const uint32 low_part = (spot + 1) == sizeof(uint32) * CHAR_BIT ? 0u : (bitfield >> (spot + 1)); - const uint32 high_part = (bitfield << (spots - (spot + 1))); - const uint32 selection_mask = spots == sizeof(uint32) * CHAR_BIT ? ~0 : ((1u << spots) - 1); + uint32 const low_part = (spot + 1) == sizeof(uint32) * CHAR_BIT ? 0u : (bitfield >> (spot + 1)); + uint32 const high_part = (bitfield << (spots - (spot + 1))); + uint32 const selection_mask = spots == sizeof(uint32) * CHAR_BIT ? ~0 : ((1u << spots) - 1); // wrap around the bitfields from the current spot to the left bitfield = (high_part | low_part) & selection_mask; // compute the step from the current spot in the bitfield - const uint32 step = ffs(~bitfield); + uint32 const step = ffs(~bitfield); // and return the new spot return (spot + step) % spots; } @@ -327,14 +331,14 @@ namespace mallocMC * otherwise -1 */ template - static ALPAKA_FN_ACC inline auto usespot(const AlpakaAcc& acc, uint32* bitfield, uint32 spots) -> int + static ALPAKA_FN_ACC inline auto usespot(AlpakaAcc const& acc, uint32* bitfield, uint32 spots) -> int { // get first spot uint32 spot = randInit() % spots; for(;;) { - const uint32 mask = 1u << spot; - const uint32 old = alpaka::atomicOp(acc, bitfield, mask); + uint32 const mask = 1u << spot; + uint32 const old = alpaka::atomicOp(acc, bitfield, mask); if((old & mask) == 0) return spot; // note: popc(old) == spots should be sufficient, @@ -364,7 +368,7 @@ namespace mallocMC */ template static ALPAKA_FN_ACC inline auto calcAdditionalChunks( - const AlpakaAcc& acc, + AlpakaAcc const& acc, uint32 fullsegments, uint32 segmentsize, uint32 chunksize) -> uint32 @@ -396,22 +400,22 @@ namespace mallocMC */ template ALPAKA_FN_ACC inline auto addChunkHierarchy( - const AlpakaAcc& acc, + AlpakaAcc const& acc, uint32 chunksize, uint32 fullsegments, uint32 additional_chunks, uint32 page) -> void* { - const uint32 segments = fullsegments + (additional_chunks > 0 ? 1 : 0); + uint32 const segments = fullsegments + (additional_chunks > 0 ? 1 : 0); uint32 spot = randInit() % segments; - const uint32 mask = _ptes[page].bitmask; + uint32 const mask = _ptes[page].bitmask; if((mask & (1u << spot)) != 0) spot = nextspot(mask, spot, segments); - const uint32 tries = segments - popc(mask); + uint32 const tries = segments - popc(mask); uint32* onpagemasks = onPageMasksPosition(page, segments); for(uint32 i = 0; i < tries; ++i) { - const int hspot = usespot(acc, &onpagemasks[spot], spot < fullsegments ? 32 : additional_chunks); + int const hspot = usespot(acc, &onpagemasks[spot], spot < fullsegments ? 32 : additional_chunks); if(hspot != -1) return _page[page].data + (32 * spot + hspot) * chunksize; alpaka::atomicOp(acc, (uint32*) &_ptes[page].bitmask, 1u << spot); @@ -431,12 +435,12 @@ namespace mallocMC */ template ALPAKA_FN_ACC inline auto addChunkNoHierarchy( - const AlpakaAcc& acc, + AlpakaAcc const& acc, uint32 chunksize, uint32 page, uint32 spots) -> void* { - const int spot = usespot(acc, (uint32*) &_ptes[page].bitmask, spots); + int const spot = usespot(acc, (uint32*) &_ptes[page].bitmask, spots); if(spot == -1) return 0; // that should be impossible :) return _page[page].data + spot * chunksize; @@ -453,7 +457,7 @@ namespace mallocMC */ template ALPAKA_FN_ACC inline auto tryUsePage( - const AlpakaAcc& acc, + AlpakaAcc const& acc, uint32 page, uint32 chunksize, T_ChunkSizeRangeCheck&& isChunkSizeInRange) -> void* @@ -461,7 +465,7 @@ namespace mallocMC void* chunk_ptr = nullptr; // increse the fill level - const uint32 filllevel = alpaka::atomicOp(acc, (uint32*) &(_ptes[page].count), 1u); + uint32 const filllevel = alpaka::atomicOp(acc, (uint32*) &(_ptes[page].count), 1u); // if resetfreedpages == false we do not need to re-check chunksize bool tryAllocMem = !resetfreedpages; @@ -477,7 +481,7 @@ namespace mallocMC * In case the page is now free (chunksize == 0) we acquire the new chunk size. * In cases where the page has already a chunksize we test if the chunksize fits our needs. */ - const uint32 oldChunksize = alpaka::atomicOp( + uint32 const oldChunksize = alpaka::atomicOp( acc, (uint32*) &_ptes[page].chunksize, 0u, @@ -501,16 +505,16 @@ namespace mallocMC { // more chunks than can be covered by the pte's single // bitfield can be used - const uint32 segmentsize = chunksize * 32 + sizeof(uint32); - const uint32 fullsegments = alpaka::math::min(acc, 32u, pagesize / segmentsize); - const uint32 additional_chunks + uint32 const segmentsize = chunksize * 32 + sizeof(uint32); + uint32 const fullsegments = alpaka::math::min(acc, 32u, pagesize / segmentsize); + uint32 const additional_chunks = calcAdditionalChunks(acc, fullsegments, segmentsize, chunksize); if(filllevel < fullsegments * 32 + additional_chunks) chunk_ptr = addChunkHierarchy(acc, chunksize, fullsegments, additional_chunks, page); } else { - const uint32 chunksinpage = alpaka::math::min(acc, pagesize / chunksize, 32u); + uint32 const chunksinpage = alpaka::math::min(acc, pagesize / chunksize, 32u); if(filllevel < chunksinpage) chunk_ptr = addChunkNoHierarchy(acc, chunksize, page, chunksinpage); } @@ -539,18 +543,18 @@ namespace mallocMC * obtain a free chunk */ template - ALPAKA_FN_ACC auto allocChunked(const AlpakaAcc& acc, uint32 bytes) -> void* + ALPAKA_FN_ACC auto allocChunked(AlpakaAcc const& acc, uint32 bytes) -> void* { // use the minimal allocation size to increase the hit rate for small allocations. - const uint32 paddedMinChunkSize = AlignmentPolicy::applyPadding(minChunkSize); - const uint32 minAllocation = alpaka::math::max(acc, bytes, paddedMinChunkSize); - const uint32 numpages = _numpages; - const uint32 pagesperblock = numpages / _accessblocks; - const uint32 reloff = warpSize * minAllocation / pagesize; - const uint32 start_page_in_block = (minAllocation * hashingK + hashingDistMP * smid() + uint32 const paddedMinChunkSize = AlignmentPolicy::applyPadding(minChunkSize); + uint32 const minAllocation = alpaka::math::max(acc, bytes, paddedMinChunkSize); + uint32 const numpages = _numpages; + uint32 const pagesperblock = numpages / _accessblocks; + uint32 const reloff = warpSize * minAllocation / pagesize; + uint32 const start_page_in_block = (minAllocation * hashingK + hashingDistMP * smid() + (hashingDistWP + hashingDistWPRel * reloff) * warpid()) - % pagesperblock; - const uint32 maxchunksize = alpaka::math::min( + % pagesperblock; + uint32 const maxchunksize = alpaka::math::min( acc, +pagesize, /* this clumping means that allocations of paddedMinChunkSize could have a waste exceeding the @@ -562,7 +566,7 @@ namespace mallocMC * - different for each thread to reduce memory read/write conflicts * - index calculated by the hash function */ - const uint32 global_start_page = start_page_in_block + _firstfreeblock * pagesperblock; + uint32 const global_start_page = start_page_in_block + _firstfreeblock * pagesperblock; uint32 checklevel = regionsize * 3 / 4; /* Finding a free segment is using a two step approach. @@ -583,9 +587,9 @@ namespace mallocMC */ do { - const uint32 region = global_page / regionsize; - const uint32 regionfilllevel = _regions[region]; - const uint32 region_offset = region * regionsize; + uint32 const region = global_page / regionsize; + uint32 const regionfilllevel = _regions[region]; + uint32 const region_offset = region * regionsize; if(regionfilllevel < checklevel) { uint32 page_in_region = global_page; @@ -634,7 +638,7 @@ namespace mallocMC // check if we jumped into the next access block if(global_page % pagesperblock == 0u) { - const uint32 access_block_id = global_page / pagesperblock; + uint32 const access_block_id = global_page / pagesperblock; // randomize the thread writing the info // Data races are not critical. if(access_block_id > _firstfreeblock) @@ -654,7 +658,7 @@ namespace mallocMC * The last thread reducing the page count to zero should call this method. */ template - ALPAKA_FN_ACC void tryCleanPage(const AlpakaAcc& acc, uint32 page) + ALPAKA_FN_ACC void tryCleanPage(AlpakaAcc const& acc, uint32 page) { if constexpr(resetfreedpages) { @@ -672,7 +676,7 @@ namespace mallocMC if(oldfilllevel == 0) { - const uint32 chunksize + uint32 const chunksize = alpaka::atomicOp(acc, (uint32*) &_ptes[page].chunksize, 0u, 0u); // if chunksize == 0 than another thread cleaned the page already if(chunksize != 0) @@ -724,19 +728,19 @@ namespace mallocMC * @param chunksize the chunksize used for the page */ template - ALPAKA_FN_ACC void deallocChunked(const AlpakaAcc& acc, void* mem, uint32 page, uint32 chunksize) + ALPAKA_FN_ACC void deallocChunked(AlpakaAcc const& acc, void* mem, uint32 page, uint32 chunksize) { - const auto inpage_offset = static_cast((char*) mem - _page[page].data); + auto const inpage_offset = static_cast((char*) mem - _page[page].data); if(chunksize <= HierarchyThreshold) { // one more level in hierarchy - const uint32 segmentsize = chunksize * 32 + sizeof(uint32); - const uint32 fullsegments = alpaka::math::min(acc, 32u, pagesize / segmentsize); - const uint32 additional_chunks = calcAdditionalChunks(acc, fullsegments, segmentsize, chunksize); - const uint32 segment = inpage_offset / (chunksize * 32); - const uint32 withinsegment = (inpage_offset - segment * (chunksize * 32)) / chunksize; + uint32 const segmentsize = chunksize * 32 + sizeof(uint32); + uint32 const fullsegments = alpaka::math::min(acc, 32u, pagesize / segmentsize); + uint32 const additional_chunks = calcAdditionalChunks(acc, fullsegments, segmentsize, chunksize); + uint32 const segment = inpage_offset / (chunksize * 32); + uint32 const withinsegment = (inpage_offset - segment * (chunksize * 32)) / chunksize; // mark it as free - const uint32 nMasks = fullsegments + (additional_chunks > 0 ? 1 : 0); + uint32 const nMasks = fullsegments + (additional_chunks > 0 ? 1 : 0); uint32* onpagemasks = onPageMasksPosition(page, nMasks); uint32 old = alpaka::atomicOp(acc, &onpagemasks[segment], ~(1u << withinsegment)); @@ -747,7 +751,7 @@ namespace mallocMC } else { - const uint32 segment = inpage_offset / chunksize; + uint32 const segment = inpage_offset / chunksize; alpaka::atomicOp(acc, (uint32*) &_ptes[page].bitmask, ~(1u << segment)); } @@ -760,10 +764,10 @@ namespace mallocMC // many threads, so.. if(oldfilllevel == pagesize / 2 / chunksize) { - const uint32 region = page / regionsize; + uint32 const region = page / regionsize; alpaka::atomicOp(acc, (uint32*) (_regions + region), 0u); - const uint32 pagesperblock = _numpages / _accessblocks; - const uint32 block = page / pagesperblock; + uint32 const pagesperblock = _numpages / _accessblocks; + uint32 const block = page / pagesperblock; if(warpid() + laneid() == 0) alpaka::atomicOp(acc, (uint32*) &_firstfreeblock, block); } @@ -777,12 +781,12 @@ namespace mallocMC * @return true on success, false if one of the pages is not free */ template - ALPAKA_FN_ACC auto markpages(const AlpakaAcc& acc, uint32 startpage, uint32 pages, uint32 bytes) -> bool + ALPAKA_FN_ACC auto markpages(AlpakaAcc const& acc, uint32 startpage, uint32 pages, uint32 bytes) -> bool { uint32 abord = std::numeric_limits::max(); for(uint32 trypage = startpage; trypage < startpage + pages; ++trypage) { - const uint32 old + uint32 const old = alpaka::atomicOp(acc, (uint32*) &_ptes[trypage].chunksize, 0u, bytes); if(old != 0) { @@ -808,12 +812,12 @@ namespace mallocMC */ template ALPAKA_FN_ACC auto allocPageBasedSingleRegion( - const AlpakaAcc& acc, + AlpakaAcc const& acc, uint32 startpage, uint32 endpage, uint32 bytes) -> void* { - const uint32 pagestoalloc = divup(bytes, pagesize); + uint32 const pagestoalloc = divup(bytes, pagesize); uint32 freecount = 0; bool left_free = false; for(uint32 search_page = startpage + 1; search_page > endpage;) @@ -856,13 +860,13 @@ namespace mallocMC * function concurrently */ template - ALPAKA_FN_ACC auto allocPageBasedSingle(const AlpakaAcc& acc, uint32 bytes) -> void* + ALPAKA_FN_ACC auto allocPageBasedSingle(AlpakaAcc const& acc, uint32 bytes) -> void* { // acquire mutex while(alpaka::atomicOp(acc, &_pagebasedMutex, 1u) != 0) ; // search for free spot from the back - const uint32 spage = _firstFreePageBased; + uint32 const spage = _firstFreePageBased; void* res = allocPageBasedSingleRegion(acc, spage, 0, bytes); if(res == 0) // also check the rest of the pages @@ -872,6 +876,7 @@ namespace mallocMC alpaka::atomicOp(acc, &_pagebasedMutex, 0u); return res; } + /** * allocPageBased tries to allocate the demanded number of bytes on * a continues sequence of pages @@ -880,7 +885,7 @@ namespace mallocMC * use all the requested pages */ template - ALPAKA_FN_ACC auto allocPageBased(const AlpakaAcc& acc, uint32 bytes) -> void* + ALPAKA_FN_ACC auto allocPageBased(AlpakaAcc const& acc, uint32 bytes) -> void* { // this is rather slow, but we dont expect that to happen often // anyway @@ -888,11 +893,11 @@ namespace mallocMC // only one thread per warp can acquire the mutex void* res = 0; // based on the alpaka backend the lanemask type can be 64bit - const auto mask = activemask(); - const uint32_t num = popc(mask); + auto const mask = activemask(); + uint32_t const num = popc(mask); // based on the alpaka backend the lanemask type can be 64bit - const auto lanemask = lanemask_lt(); - const uint32_t local_id = popc(lanemask & mask); + auto const lanemask = lanemask_lt(); + uint32_t const local_id = popc(lanemask & mask); for(unsigned int active = 0; active < num; ++active) if(active == local_id) res = allocPageBasedSingle(acc, bytes); @@ -906,9 +911,9 @@ namespace mallocMC * @param bytes the number of bytes to be freed */ template - ALPAKA_FN_ACC void deallocPageBased(const AlpakaAcc& acc, void* mem, uint32 page, uint32 bytes) + ALPAKA_FN_ACC void deallocPageBased(AlpakaAcc const& acc, void* mem, uint32 page, uint32 bytes) { - const uint32 pages = divup(bytes, pagesize); + uint32 const pages = divup(bytes, pagesize); for(uint32 p = page; p < page + pages; ++p) _page[p].init(); @@ -927,7 +932,7 @@ namespace mallocMC * @return pointer to the allocated memory */ template - ALPAKA_FN_ACC auto create(const AlpakaAcc& acc, uint32 bytes) -> void* + ALPAKA_FN_ACC auto create(AlpakaAcc const& acc, uint32 bytes) -> void* { if(bytes == 0) return 0; @@ -950,28 +955,28 @@ namespace mallocMC * @param mempointer to the memory region to free */ template - ALPAKA_FN_ACC void destroy(const AlpakaAcc& acc, void* mem) + ALPAKA_FN_ACC void destroy(AlpakaAcc const& acc, void* mem) { if(mem == 0) return; // lets see on which page we are on - const auto page = static_cast(((char*) mem - (char*) _page) / pagesize); + auto const page = static_cast(((char*) mem - (char*) _page) / pagesize); /* Emulate atomic read. * In older implementations we read the chunksize without atomics which can result in data races. */ - const uint32 chunksize + uint32 const chunksize = alpaka::atomicOp(acc, (uint32*) &_ptes[page].chunksize, 0u, 0u); // is the pointer the beginning of a chunk? - const auto inpage_offset = static_cast((char*) mem - _page[page].data); - const uint32 block = inpage_offset / chunksize; - const uint32 inblockoffset = inpage_offset - block * chunksize; + auto const inpage_offset = static_cast((char*) mem - _page[page].data); + uint32 const block = inpage_offset / chunksize; + uint32 const inblockoffset = inpage_offset - block * chunksize; if(inblockoffset != 0) { uint32* counter = (uint32*) (_page[page].data + block * chunksize); // coalesced mem free - const uint32 old = alpaka::atomicOp(acc, counter, 1u); + uint32 const old = alpaka::atomicOp(acc, counter, 1u); if(old != 1) return; mem = (void*) counter; @@ -992,13 +997,13 @@ namespace mallocMC * @param memsize size of the memory in bytes */ template - ALPAKA_FN_ACC void initDeviceFunction(const AlpakaAcc& acc, void* memory, size_t memsize) + ALPAKA_FN_ACC void initDeviceFunction(AlpakaAcc const& acc, void* memory, size_t memsize) { - const auto linid = alpaka::getIdx(acc).sum(); - const auto totalThreads = alpaka::getWorkDiv(acc).prod(); + auto const linid = alpaka::getIdx(acc).sum(); + auto const totalThreads = alpaka::getWorkDiv(acc).prod(); uint32 numregions = ((unsigned long long) memsize) - / (((unsigned long long) regionsize) * (sizeof(PTE) + pagesize) + sizeof(uint32)); + / (((unsigned long long) regionsize) * (sizeof(PTE) + pagesize) + sizeof(uint32)); uint32 numpages = numregions * regionsize; // pointer is copied (copy is called page) @@ -1026,11 +1031,11 @@ namespace mallocMC // However, we do not have to use the exact requested block size. // So we redistribute actual memory between the chosen number of blocks // and ensure that all blocks have the same number of regions. - const auto memorysize = static_cast(numpages) * pagesize; - const auto numblocks = memorysize / accessblocksize; - const auto memoryperblock = memorysize / numblocks; - const auto pagesperblock = memoryperblock / pagesize; - const auto regionsperblock = pagesperblock / regionsize; + auto const memorysize = static_cast(numpages) * pagesize; + auto const numblocks = memorysize / accessblocksize; + auto const memoryperblock = memorysize / numblocks; + auto const pagesperblock = memoryperblock / pagesize; + auto const regionsperblock = pagesperblock / regionsize; numregions = numblocks * regionsperblock; numpages = numregions * regionsize; @@ -1108,7 +1113,7 @@ namespace mallocMC "or AlignmentPolicy."); } auto initKernel = [] ALPAKA_FN_ACC( - const AlpakaAcc& m_acc, + AlpakaAcc const& m_acc, T_DeviceAllocator* m_heap, void* m_heapmem, size_t m_memsize) @@ -1127,7 +1132,7 @@ namespace mallocMC threadsPerBlock[Dim::value - 1] = std::min(static_cast(256u), static_cast(devProps.m_blockThreadCountMax)); - const auto workDiv = alpaka::WorkDivMembers{ + auto const workDiv = alpaka::WorkDivMembers{ VecType::ones(), threadsPerBlock, VecType::ones()}; // Dim may be any dimension, but workDiv is 1D @@ -1149,26 +1154,26 @@ namespace mallocMC * page. */ template - ALPAKA_FN_ACC auto countFreeChunksInPage(const AlpakaAcc& acc, uint32 page, uint32 chunksize) -> unsigned + ALPAKA_FN_ACC auto countFreeChunksInPage(AlpakaAcc const& acc, uint32 page, uint32 chunksize) -> unsigned { - const uint32 filledChunks = _ptes[page].count; + uint32 const filledChunks = _ptes[page].count; if(chunksize <= HierarchyThreshold) { - const uint32 segmentsize = chunksize * 32 + sizeof(uint32); // each segment can hold 32 + uint32 const segmentsize = chunksize * 32 + sizeof(uint32); // each segment can hold 32 // 2nd-level chunks - const uint32 fullsegments = alpaka::math::min( + uint32 const fullsegments = alpaka::math::min( acc, 32u, pagesize / segmentsize); // there might be space for // more than 32 segments // with 32 2nd-level chunks - const uint32 additional_chunks = calcAdditionalChunks(acc, fullsegments, segmentsize, chunksize); - const uint32 level2Chunks = fullsegments * 32 + additional_chunks; + uint32 const additional_chunks = calcAdditionalChunks(acc, fullsegments, segmentsize, chunksize); + uint32 const level2Chunks = fullsegments * 32 + additional_chunks; return level2Chunks - filledChunks; } else { - const uint32 chunksinpage = alpaka::math::min( + uint32 const chunksinpage = alpaka::math::min( acc, pagesize / chunksize, 32u); // without hierarchy, there can not be more than @@ -1196,7 +1201,7 @@ namespace mallocMC */ template ALPAKA_FN_ACC auto getAvailaibleSlotsDeviceFunction( - const AlpakaAcc& acc, + AlpakaAcc const& acc, size_t slotSize, uint32 gid, uint32 stride) -> unsigned @@ -1206,7 +1211,7 @@ namespace mallocMC { // multiple slots per page for(uint32 currentpage = gid; currentpage < _numpages; currentpage += stride) { - const uint32 maxchunksize = alpaka::math::min(acc, +pagesize, wastefactor * (uint32) slotSize); + uint32 const maxchunksize = alpaka::math::min(acc, +pagesize, wastefactor * (uint32) slotSize); uint32 chunksize = _ptes[currentpage].chunksize; if(chunksize >= slotSize && chunksize <= maxchunksize) @@ -1235,7 +1240,7 @@ namespace mallocMC { // 1 slot needs multiple pages if(gid > 0) return 0; // do this serially - const uint32 pagestoalloc = divup((uint32) slotSize, pagesize); + uint32 const pagestoalloc = divup((uint32) slotSize, pagesize); uint32 freecount = 0; for(uint32 currentpage = _numpages; currentpage > 0;) { // this already includes all superblocks @@ -1270,6 +1275,7 @@ namespace mallocMC * @param slotSize the size of allocatable elements to count * @param obj a reference to the allocator instance (host-side) */ + public: template static auto getAvailableSlotsHost( @@ -1282,15 +1288,15 @@ namespace mallocMC alpaka::memset(queue, d_slots, 0, 1); auto getAvailableSlotsKernel = [] ALPAKA_FN_ACC( - const AlpakaAcc& acc, + AlpakaAcc const& acc, T_DeviceAllocator* heapPtr, size_t numBytes, unsigned* slots) -> void { - const auto gid = alpaka::getIdx(acc).sum(); + auto const gid = alpaka::getIdx(acc).sum(); - const auto nWorker = alpaka::getWorkDiv(acc).prod(); - const unsigned temp = heapPtr->template getAvailaibleSlotsDeviceFunction< + auto const nWorker = alpaka::getWorkDiv(acc).prod(); + unsigned const temp = heapPtr->template getAvailaibleSlotsDeviceFunction< typename T_DeviceAllocator::AlignmentPolicy>(acc, numBytes, gid, nWorker); if(temp) alpaka::atomicOp(acc, slots, temp); @@ -1310,7 +1316,7 @@ namespace mallocMC threadsPerBlock[Dim::value - 1] = std::min(static_cast(256u), static_cast(devProps.m_blockThreadCountMax)); - const auto workDiv = alpaka::WorkDivMembers{ + auto const workDiv = alpaka::WorkDivMembers{ numBlocks, threadsPerBlock, VecType::ones()}; // Dim may be any dimension, but workDiv is 1D @@ -1325,7 +1331,7 @@ namespace mallocMC alpaka::getPtrNative(d_slots))); auto const platform = alpaka::Platform{}; - const auto hostDev = alpaka::getDevByIdx(platform, 0); + auto const hostDev = alpaka::getDevByIdx(platform, 0); auto h_slots = alpaka::allocBuf(hostDev, 1); alpaka::memcpy(queue, h_slots, d_slots, 1); @@ -1349,13 +1355,13 @@ namespace mallocMC * @param slotSize the size of allocatable elements to count */ template - ALPAKA_FN_ACC auto getAvailableSlotsAccelerator(const AlpakaAcc& acc, size_t slotSize) -> unsigned + ALPAKA_FN_ACC auto getAvailableSlotsAccelerator(AlpakaAcc const& acc, size_t slotSize) -> unsigned { - const int wId = warpid_withinblock(acc); // do not use warpid-function, since + int const wId = warpid_withinblock(acc); // do not use warpid-function, since // this value is not guaranteed to // be stable across warp lifetime - const uint32 activeThreads = popc(activemask()); + uint32 const activeThreads = popc(activemask()); auto& activePerWarp = alpaka::declareSharedVar< std::uint32_t[maxThreadsPerBlock / warpSize], @@ -1373,11 +1379,11 @@ namespace mallocMC // the active threads obtain an id from 0 to activeThreads-1 if(slotSize == 0) return 0; - const auto linearId = alpaka::atomicOp(acc, &activePerWarp[wId], 1u); + auto const linearId = alpaka::atomicOp(acc, &activePerWarp[wId], 1u); // printf("Block %d, id %d: activeThreads=%d // linearId=%d\n",blockIdx.x,threadIdx.x,activeThreads,linearId); - const unsigned temp = this->template getAvailaibleSlotsDeviceFunction( + unsigned const temp = this->template getAvailaibleSlotsDeviceFunction( acc, slotSize, linearId, diff --git a/src/include/mallocMC/device_allocator.hpp b/src/include/mallocMC/device_allocator.hpp index 8bb6c2f4..52e4e736 100644 --- a/src/include/mallocMC/device_allocator.hpp +++ b/src/include/mallocMC/device_allocator.hpp @@ -33,6 +33,7 @@ #include "mallocMC_utils.hpp" #include + #include #include @@ -70,11 +71,11 @@ namespace mallocMC void* pool; template - ALPAKA_FN_ACC auto malloc(const AlpakaAcc& acc, size_t bytes) -> void* + ALPAKA_FN_ACC auto malloc(AlpakaAcc const& acc, size_t bytes) -> void* { bytes = AlignmentPolicy::applyPadding(bytes); DistributionPolicy distributionPolicy(acc); - const uint32 req_size = distributionPolicy.collect(acc, bytes); + uint32 const req_size = distributionPolicy.collect(acc, bytes); void* memBlock = CreationPolicy::template create(acc, req_size); if(CreationPolicy::isOOM(memBlock, req_size)) memBlock = OOMPolicy::handleOOM(memBlock); @@ -82,7 +83,7 @@ namespace mallocMC } template - ALPAKA_FN_ACC void free(const AlpakaAcc& acc, void* p) + ALPAKA_FN_ACC void free(AlpakaAcc const& acc, void* p) { CreationPolicy::destroy(acc, p); } @@ -96,7 +97,7 @@ namespace mallocMC * device side 0 will be returned. */ template - ALPAKA_FN_ACC auto getAvailableSlots(const AlpakaAcc& acc, size_t slotSize) -> unsigned + ALPAKA_FN_ACC auto getAvailableSlots(AlpakaAcc const& acc, size_t slotSize) -> unsigned { slotSize = AlignmentPolicy::applyPadding(slotSize); if constexpr(Traits::providesAvailableSlots) diff --git a/src/include/mallocMC/distributionPolicies/Noop.hpp b/src/include/mallocMC/distributionPolicies/Noop.hpp index 19cc471d..2f43640f 100644 --- a/src/include/mallocMC/distributionPolicies/Noop.hpp +++ b/src/include/mallocMC/distributionPolicies/Noop.hpp @@ -30,6 +30,7 @@ #include "Noop.hpp" #include + #include #include @@ -49,18 +50,18 @@ namespace mallocMC public: template - ALPAKA_FN_ACC Noop(const AlpakaAcc& /*acc*/) + ALPAKA_FN_ACC Noop(AlpakaAcc const& /*acc*/) { } template - ALPAKA_FN_ACC auto collect(const AlpakaAcc& /*acc*/, uint32 bytes) const -> uint32 + ALPAKA_FN_ACC auto collect(AlpakaAcc const& /*acc*/, uint32 bytes) const -> uint32 { return bytes; } template - ALPAKA_FN_ACC auto distribute(const AlpakaAcc& /*acc*/, void* allocatedMem) const -> void* + ALPAKA_FN_ACC auto distribute(AlpakaAcc const& /*acc*/, void* allocatedMem) const -> void* { return allocatedMem; } diff --git a/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp b/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp index a67dd333..eb4f3d59 100644 --- a/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp +++ b/src/include/mallocMC/distributionPolicies/XMallocSIMD.hpp @@ -37,6 +37,7 @@ #include "XMallocSIMD.hpp" #include + #include #include #include @@ -88,7 +89,7 @@ namespace mallocMC using Properties = T_Config; template - ALPAKA_FN_ACC XMallocSIMD(const AlpakaAcc& acc) + ALPAKA_FN_ACC XMallocSIMD(AlpakaAcc const& acc) : can_use_coalescing(false) , warpid(warpid_withinblock(acc)) , myoffset(0) @@ -116,7 +117,7 @@ namespace mallocMC static constexpr uint32 _pagesize = pagesize; template - ALPAKA_FN_ACC auto collect(const AlpakaAcc& acc, uint32 bytes) -> uint32 + ALPAKA_FN_ACC auto collect(AlpakaAcc const& acc, uint32 bytes) -> uint32 { can_use_coalescing = false; myoffset = 0; @@ -129,7 +130,7 @@ namespace mallocMC // second half: make sure that all coalesced allocations can fit // within one page necessary for offset calculation - const bool coalescible = bytes > 0 && bytes < (pagesize / 32); + bool const coalescible = bytes > 0 && bytes < (pagesize / 32); #if(MALLOCMC_DEVICE_COMPILE) threadcount = popc(ballot(coalescible)); @@ -150,7 +151,7 @@ namespace mallocMC } template - ALPAKA_FN_ACC auto distribute(const AlpakaAcc& acc, void* allocatedMem) -> void* + ALPAKA_FN_ACC auto distribute(AlpakaAcc const& acc, void* allocatedMem) -> void* { auto& warp_res = alpaka::declareSharedVar(acc); diff --git a/src/include/mallocMC/mallocMC_allocator_handle.hpp b/src/include/mallocMC/mallocMC_allocator_handle.hpp index 4d5361a7..1da222fa 100644 --- a/src/include/mallocMC/mallocMC_allocator_handle.hpp +++ b/src/include/mallocMC/mallocMC_allocator_handle.hpp @@ -44,19 +44,19 @@ namespace mallocMC } template - ALPAKA_FN_ACC auto malloc(const AlpakaAcc& acc, size_t size) -> void* + ALPAKA_FN_ACC auto malloc(AlpakaAcc const& acc, size_t size) -> void* { return devAllocator->malloc(acc, size); } template - ALPAKA_FN_ACC void free(const AlpakaAcc& acc, void* p) + ALPAKA_FN_ACC void free(AlpakaAcc const& acc, void* p) { devAllocator->free(acc, p); } template - ALPAKA_FN_ACC auto getAvailableSlots(const AlpakaAcc& acc, size_t slotSize) -> unsigned + ALPAKA_FN_ACC auto getAvailableSlots(AlpakaAcc const& acc, size_t slotSize) -> unsigned { return devAllocator->getAvailableSlots(acc, slotSize); } diff --git a/src/include/mallocMC/mallocMC_utils.hpp b/src/include/mallocMC/mallocMC_utils.hpp index 6bf953b4..c1c9b24f 100644 --- a/src/include/mallocMC/mallocMC_utils.hpp +++ b/src/include/mallocMC/mallocMC_utils.hpp @@ -62,6 +62,7 @@ namespace mallocMC public: using type = unsigned int; }; + template<> class __PointerEquivalent<8> { @@ -161,7 +162,6 @@ namespace mallocMC #endif } - ALPAKA_FN_ACC inline auto activemask() { #if defined(__CUDA_ARCH__) @@ -194,9 +194,9 @@ namespace mallocMC * @return warp id within the block */ template - ALPAKA_FN_ACC inline auto warpid_withinblock(const AlpakaAcc& acc) -> std::uint32_t + ALPAKA_FN_ACC inline auto warpid_withinblock(AlpakaAcc const& acc) -> std::uint32_t { - const auto localId = alpaka::mapIdx<1>( + auto const localId = alpaka::mapIdx<1>( alpaka::getIdx(acc), alpaka::getWorkDiv(acc))[0]; return localId / warpSize; diff --git a/src/include/mallocMC/oOMPolicies/BadAllocException.hpp b/src/include/mallocMC/oOMPolicies/BadAllocException.hpp index f3a3bb75..7d7dfcad 100644 --- a/src/include/mallocMC/oOMPolicies/BadAllocException.hpp +++ b/src/include/mallocMC/oOMPolicies/BadAllocException.hpp @@ -30,6 +30,7 @@ #include "BadAllocException.hpp" #include + #include #include diff --git a/src/include/mallocMC/oOMPolicies/ReturnNull.hpp b/src/include/mallocMC/oOMPolicies/ReturnNull.hpp index 5ddd698a..dbea98e7 100644 --- a/src/include/mallocMC/oOMPolicies/ReturnNull.hpp +++ b/src/include/mallocMC/oOMPolicies/ReturnNull.hpp @@ -30,6 +30,7 @@ #include "ReturnNull.hpp" #include + #include namespace mallocMC diff --git a/src/include/mallocMC/reservePoolPolicies/AlpakaBuf.hpp b/src/include/mallocMC/reservePoolPolicies/AlpakaBuf.hpp index ca8bcb40..5ffa1980 100644 --- a/src/include/mallocMC/reservePoolPolicies/AlpakaBuf.hpp +++ b/src/include/mallocMC/reservePoolPolicies/AlpakaBuf.hpp @@ -28,6 +28,7 @@ #pragma once #include + #include #include @@ -39,7 +40,7 @@ namespace mallocMC struct AlpakaBuf { template - auto setMemPool(const AlpakaDev& dev, size_t memsize) -> void* + auto setMemPool(AlpakaDev const& dev, size_t memsize) -> void* { poolBuffer = std::make_unique(alpaka::allocBuf(dev, memsize)); return alpaka::getPtrNative(*poolBuffer); diff --git a/src/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp b/src/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp index d2d9be66..b77132f9 100644 --- a/src/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp +++ b/src/include/mallocMC/reservePoolPolicies/CudaSetLimits.hpp @@ -32,6 +32,7 @@ # include "CudaSetLimits.hpp" # include + # include # include @@ -57,7 +58,7 @@ namespace mallocMC struct CudaSetLimits { template - auto setMemPool(const AlpakaDev& dev, size_t memsize) -> void* + auto setMemPool(AlpakaDev const& dev, size_t memsize) -> void* { cudaDeviceSetLimit(cudaLimitMallocHeapSize, memsize); return nullptr; diff --git a/tests/dimensions.cpp b/tests/dimensions.cpp index bd7e3343..a39ff448 100644 --- a/tests/dimensions.cpp +++ b/tests/dimensions.cpp @@ -26,6 +26,7 @@ */ #include + #include #include @@ -77,7 +78,7 @@ void test1D() mallocMC::AlignmentPolicies::Shrink>; auto const platform = alpaka::Platform{}; - const auto dev = alpaka::getDevByIdx(platform, 0); + auto const dev = alpaka::getDevByIdx(platform, 0); auto queue = alpaka::Queue{dev}; constexpr auto N = 16; @@ -90,7 +91,7 @@ void test1D() queue, alpaka::createTaskKernel( alpaka::WorkDivMembers{Idx{1}, Idx{1}, Idx{1}}, - [] ALPAKA_FN_ACC(const Acc& acc, int dim, typename ScatterAllocator::AllocatorHandle allocHandle) + [] ALPAKA_FN_ACC(Acc const& acc, int dim, typename ScatterAllocator::AllocatorHandle allocHandle) { deviceArray = (int**) allocHandle.malloc(acc, sizeof(int*) * dim * dim); }, N, scatterAlloc.getAllocatorHandle())); @@ -100,15 +101,15 @@ void test1D() queue, alpaka::createTaskKernel( alpaka::WorkDivMembers{Idx{N}, Idx{N}, Idx{1}}, - [] ALPAKA_FN_ACC(const Acc& acc, typename ScatterAllocator::AllocatorHandle allocHandle) + [] ALPAKA_FN_ACC(Acc const& acc, typename ScatterAllocator::AllocatorHandle allocHandle) { - const auto i = alpaka::getIdx(acc)[0]; + auto const i = alpaka::getIdx(acc)[0]; deviceArray[i] = (int*) allocHandle.malloc(acc, sizeof(int)); }, scatterAlloc.getAllocatorHandle())); - const auto slots = scatterAlloc.getAvailableSlots(dev, queue, sizeof(int)); - const auto heapInfo = scatterAlloc.getHeapLocations().at(0); + auto const slots = scatterAlloc.getAvailableSlots(dev, queue, sizeof(int)); + auto const heapInfo = scatterAlloc.getHeapLocations().at(0); std::cout << alpaka::trait::GetAccName::getAccName() << " slots: " << slots << " heap size: " << heapInfo.size << '\n'; @@ -117,9 +118,9 @@ void test1D() queue, alpaka::createTaskKernel( alpaka::WorkDivMembers{Idx{N}, Idx{N}, Idx{1}}, - [] ALPAKA_FN_ACC(const Acc& acc, typename ScatterAllocator::AllocatorHandle allocHandle) + [] ALPAKA_FN_ACC(Acc const& acc, typename ScatterAllocator::AllocatorHandle allocHandle) { - const auto i = alpaka::getIdx(acc)[0]; + auto const i = alpaka::getIdx(acc)[0]; allocHandle.free(acc, deviceArray[i]); }, scatterAlloc.getAllocatorHandle())); @@ -129,7 +130,7 @@ void test1D() queue, alpaka::createTaskKernel( alpaka::WorkDivMembers{Idx{1}, Idx{1}, Idx{1}}, - [] ALPAKA_FN_ACC(const Acc& acc, typename ScatterAllocator::AllocatorHandle allocHandle) + [] ALPAKA_FN_ACC(Acc const& acc, typename ScatterAllocator::AllocatorHandle allocHandle) { allocHandle.free(acc, deviceArray); }, scatterAlloc.getAllocatorHandle())); } @@ -149,7 +150,7 @@ void test2D() mallocMC::AlignmentPolicies::Shrink>; auto const platform = alpaka::Platform{}; - const auto dev = alpaka::getDevByIdx(platform, 0); + auto const dev = alpaka::getDevByIdx(platform, 0); auto queue = alpaka::Queue{dev}; constexpr auto N = 8; @@ -165,7 +166,7 @@ void test2D() alpaka::Vec::all(1), alpaka::Vec::all(1), alpaka::Vec::all(1)}, - [] ALPAKA_FN_ACC(const Acc& acc, int dim, typename ScatterAllocator::AllocatorHandle allocHandle) + [] ALPAKA_FN_ACC(Acc const& acc, int dim, typename ScatterAllocator::AllocatorHandle allocHandle) { deviceArray = (int**) allocHandle.malloc(acc, sizeof(int*) * dim * dim * dim * dim); }, N, scatterAlloc.getAllocatorHandle())); @@ -178,16 +179,16 @@ void test2D() alpaka::Vec::all(N), alpaka::Vec::all(N), alpaka::Vec::all(1)}, - [] ALPAKA_FN_ACC(const Acc& acc, int dim, typename ScatterAllocator::AllocatorHandle allocHandle) + [] ALPAKA_FN_ACC(Acc const& acc, int dim, typename ScatterAllocator::AllocatorHandle allocHandle) { - const auto idx = alpaka::getIdx(acc); + auto const idx = alpaka::getIdx(acc); deviceArray[idx[0] * dim * dim + idx[1]] = (int*) allocHandle.malloc(acc, sizeof(int)); }, N, scatterAlloc.getAllocatorHandle())); - const auto slots = scatterAlloc.getAvailableSlots(dev, queue, sizeof(int)); - const auto heapInfo = scatterAlloc.getHeapLocations().at(0); + auto const slots = scatterAlloc.getAvailableSlots(dev, queue, sizeof(int)); + auto const heapInfo = scatterAlloc.getHeapLocations().at(0); std::cout << alpaka::trait::GetAccName::getAccName() << " slots: " << slots << " heap size: " << heapInfo.size << '\n'; @@ -199,9 +200,9 @@ void test2D() alpaka::Vec::all(N), alpaka::Vec::all(N), alpaka::Vec::all(1)}, - [] ALPAKA_FN_ACC(const Acc& acc, int dim, typename ScatterAllocator::AllocatorHandle allocHandle) + [] ALPAKA_FN_ACC(Acc const& acc, int dim, typename ScatterAllocator::AllocatorHandle allocHandle) { - const auto idx = alpaka::getIdx(acc); + auto const idx = alpaka::getIdx(acc); allocHandle.free(acc, deviceArray[idx[0] * dim * dim + idx[1]]); }, N, @@ -215,7 +216,7 @@ void test2D() alpaka::Vec::all(1), alpaka::Vec::all(1), alpaka::Vec::all(1)}, - [] ALPAKA_FN_ACC(const Acc& acc, typename ScatterAllocator::AllocatorHandle allocHandle) + [] ALPAKA_FN_ACC(Acc const& acc, typename ScatterAllocator::AllocatorHandle allocHandle) { allocHandle.free(acc, deviceArray); }, scatterAlloc.getAllocatorHandle())); } @@ -235,7 +236,7 @@ void test3D() mallocMC::AlignmentPolicies::Shrink>; auto const platform = alpaka::Platform{}; - const auto dev = alpaka::getDevByIdx(platform, 0); + auto const dev = alpaka::getDevByIdx(platform, 0); auto queue = alpaka::Queue{dev}; constexpr auto N = 4; @@ -251,7 +252,7 @@ void test3D() alpaka::Vec::all(1), alpaka::Vec::all(1), alpaka::Vec::all(1)}, - [] ALPAKA_FN_ACC(const Acc& acc, int dim, typename ScatterAllocator::AllocatorHandle allocHandle) + [] ALPAKA_FN_ACC(Acc const& acc, int dim, typename ScatterAllocator::AllocatorHandle allocHandle) { deviceArray = (int**) allocHandle.malloc(acc, sizeof(int*) * dim * dim * dim * dim * dim * dim); }, N, scatterAlloc.getAllocatorHandle())); @@ -265,17 +266,17 @@ void test3D() alpaka::Vec::all(N), alpaka::Vec::all(N), alpaka::Vec::all(1)}, - [] ALPAKA_FN_ACC(const Acc& acc, int dim, typename ScatterAllocator::AllocatorHandle allocHandle) + [] ALPAKA_FN_ACC(Acc const& acc, int dim, typename ScatterAllocator::AllocatorHandle allocHandle) { - const auto idx = alpaka::getIdx(acc); + auto const idx = alpaka::getIdx(acc); deviceArray[idx[0] * dim * dim * dim * dim + idx[1] * dim * dim + idx[0]] = (int*) allocHandle.malloc(acc, sizeof(int)); }, N, scatterAlloc.getAllocatorHandle())); - const auto slots = scatterAlloc.getAvailableSlots(dev, queue, sizeof(int)); - const auto heapInfo = scatterAlloc.getHeapLocations().at(0); + auto const slots = scatterAlloc.getAvailableSlots(dev, queue, sizeof(int)); + auto const heapInfo = scatterAlloc.getHeapLocations().at(0); std::cout << alpaka::trait::GetAccName::getAccName() << " slots: " << slots << " heap size: " << heapInfo.size << '\n'; @@ -288,9 +289,9 @@ void test3D() alpaka::Vec::all(N), alpaka::Vec::all(N), alpaka::Vec::all(1)}, - [] ALPAKA_FN_ACC(const Acc& acc, int dim, typename ScatterAllocator::AllocatorHandle allocHandle) + [] ALPAKA_FN_ACC(Acc const& acc, int dim, typename ScatterAllocator::AllocatorHandle allocHandle) { - const auto idx = alpaka::getIdx(acc); + auto const idx = alpaka::getIdx(acc); allocHandle.free(acc, deviceArray[idx[0] * dim * dim * dim * dim + idx[1] * dim * dim + idx[0]]); }, N, @@ -304,7 +305,7 @@ void test3D() alpaka::Vec::all(1), alpaka::Vec::all(1), alpaka::Vec::all(1)}, - [] ALPAKA_FN_ACC(const Acc& acc, typename ScatterAllocator::AllocatorHandle allocHandle) + [] ALPAKA_FN_ACC(Acc const& acc, typename ScatterAllocator::AllocatorHandle allocHandle) { allocHandle.free(acc, deviceArray); }, scatterAlloc.getAllocatorHandle())); } diff --git a/tests/policies.cpp b/tests/policies.cpp index cbb6f453..5e9c9cb3 100644 --- a/tests/policies.cpp +++ b/tests/policies.cpp @@ -26,6 +26,7 @@ */ #include + #include #include @@ -64,7 +65,7 @@ template void run() { auto const platform = alpaka::Platform{}; - const auto dev = alpaka::getDevByIdx(platform, 0); + auto const dev = alpaka::getDevByIdx(platform, 0); auto queue = alpaka::Queue{dev}; @@ -73,7 +74,7 @@ void run() queue, alpaka::createTaskKernel( alpaka::WorkDivMembers{Idx{1}, Idx{1}, Idx{1}}, - [] ALPAKA_FN_ACC(const Acc& acc, typename ScatterAllocator::AllocatorHandle allocHandle) + [] ALPAKA_FN_ACC(Acc const& acc, typename ScatterAllocator::AllocatorHandle allocHandle) { auto* ptr = allocHandle.malloc(acc, sizeof(int) * 1000); allocHandle.free(acc, ptr); diff --git a/tests/verify_heap.cpp b/tests/verify_heap.cpp index b6f1a103..7d696a10 100644 --- a/tests/verify_heap.cpp +++ b/tests/verify_heap.cpp @@ -32,11 +32,13 @@ constexpr auto ELEMS_PER_SLOT = 750; #include "verify_heap_config.hpp" -#include #include + +#include + +#include #include #include -#include #include #include #include @@ -50,8 +52,8 @@ bool verbose = false; // the type of the elements to allocate using allocElem_t = unsigned long long; -auto run_heap_verification(const size_t, const unsigned, unsigned, const bool) -> bool; -void parse_cmdline(const int, char**, size_t*, unsigned*, unsigned*, bool*); +auto run_heap_verification(size_t const, unsigned const, unsigned, bool const) -> bool; +void parse_cmdline(int const, char**, size_t*, unsigned*, unsigned*, bool*); void print_help(char**); // used to create an empty stream for non-verbose output @@ -92,7 +94,7 @@ auto main(int argc, char** argv) -> int parse_cmdline(argc, argv, &heapInMB, &threads, &blocks, &machine_readable); - const auto correct = run_heap_verification(heapInMB, threads, blocks, machine_readable); + auto const correct = run_heap_verification(heapInMB, threads, blocks, machine_readable); if(!machine_readable || verbose) { if(correct) @@ -120,7 +122,7 @@ auto main(int argc, char** argv) -> int * @param blocks will be filled with number of blocks, if given as a parameter */ void parse_cmdline( - const int argc, + int const argc, char** argv, size_t* heapInMB, unsigned* threads, @@ -241,22 +243,22 @@ void print_help(char** argv) struct Check_content { ALPAKA_FN_ACC void operator()( - const Acc& acc, + Acc const& acc, allocElem_t** data, unsigned long long* counter, unsigned long long* globalSum, - const size_t nSlots, + size_t const nSlots, int* correct) const { unsigned long long sum = 0; while(true) { - const size_t pos = alpaka::atomicOp(acc, counter, 1ull); + size_t const pos = alpaka::atomicOp(acc, counter, 1ull); if(pos >= nSlots) { break; } - const size_t offset = pos * ELEMS_PER_SLOT; + size_t const offset = pos * ELEMS_PER_SLOT; for(size_t i = 0; i < ELEMS_PER_SLOT; ++i) { if(static_cast(data[pos][i]) != static_cast(offset + i)) @@ -290,10 +292,10 @@ struct Check_content struct Check_content_fast { ALPAKA_FN_ACC void operator()( - const Acc& acc, + Acc const& acc, allocElem_t** data, unsigned long long* counter, - const size_t nSlots, + size_t const nSlots, int* correct) const { int c = 1; @@ -304,7 +306,7 @@ struct Check_content_fast { break; } - const size_t offset = pos * ELEMS_PER_SLOT; + size_t const offset = pos * ELEMS_PER_SLOT; for(size_t i = 0; i < ELEMS_PER_SLOT; ++i) { if(static_cast(data[pos][i]) != static_cast(offset + i)) @@ -333,7 +335,7 @@ struct Check_content_fast struct AllocAll { ALPAKA_FN_ACC void operator()( - const Acc& acc, + Acc const& acc, allocElem_t** data, unsigned long long* counter, unsigned long long* globalSum, @@ -347,7 +349,7 @@ struct AllocAll break; size_t pos = alpaka::atomicOp(acc, counter, 1ull); - const size_t offset = pos * ELEMS_PER_SLOT; + size_t const offset = pos * ELEMS_PER_SLOT; for(size_t i = 0; i < ELEMS_PER_SLOT; ++i) { p[i] = static_cast(offset + i); @@ -371,10 +373,10 @@ struct AllocAll struct DeallocAll { ALPAKA_FN_ACC void operator()( - const Acc& acc, + Acc const& acc, allocElem_t** data, unsigned long long* counter, - const size_t nSlots, + size_t const nSlots, ScatterAllocator::AllocatorHandle mMC) const { while(true) @@ -398,7 +400,7 @@ struct DeallocAll */ struct DamageElement { - ALPAKA_FN_ACC void operator()(const Acc& acc, allocElem_t** data) const + ALPAKA_FN_ACC void operator()(Acc const& acc, allocElem_t** data) const { data[1][0] = static_cast(5 * ELEMS_PER_SLOT - 1); } @@ -419,13 +421,13 @@ struct DamageElement * @param threads the number of CUDA threads per block */ void allocate( - const Device& dev, + Device const& dev, Queue& queue, alpaka::Buf& d_testData, unsigned long long* nSlots, unsigned long long* sum, - const unsigned blocks, - const unsigned threads, + unsigned const blocks, + unsigned const threads, ScatterAllocator& mMC) { dout() << "allocating on device..."; @@ -436,7 +438,7 @@ void allocate( alpaka::memset(queue, d_sum, 0, 1); alpaka::memset(queue, d_nSlots, 0, 1); - const auto workDiv = alpaka::WorkDivMembers{Idx{blocks}, Idx{threads}, Idx{1}}; + auto const workDiv = alpaka::WorkDivMembers{Idx{blocks}, Idx{threads}, Idx{1}}; alpaka::enqueue( queue, alpaka::createTaskKernel( @@ -448,7 +450,7 @@ void allocate( mMC.getAllocatorHandle())); auto const platform = alpaka::Platform{}; - const auto hostDev = alpaka::getDevByIdx(platform, 0); + auto const hostDev = alpaka::getDevByIdx(platform, 0); auto h_sum = alpaka::allocBuf(hostDev, Idx{1}); auto h_nSlots = alpaka::allocBuf(hostDev, Idx{1}); @@ -477,17 +479,17 @@ void allocate( * @return true if the verification was successful, false otherwise */ auto verify( - const Device& dev, + Device const& dev, Queue& queue, alpaka::Buf& d_testData, - const unsigned long long nSlots, - const unsigned blocks, - const unsigned threads) -> bool + unsigned long long const nSlots, + unsigned const blocks, + unsigned const threads) -> bool { dout() << "verifying on device... "; auto const platform = alpaka::Platform{}; - const auto hostDev = alpaka::getDevByIdx(platform, 0); + auto const hostDev = alpaka::getDevByIdx(platform, 0); auto h_correct = alpaka::allocBuf(hostDev, Idx{1}); *alpaka::getPtrNative(h_correct) = 1; @@ -503,7 +505,7 @@ auto verify( // can be replaced by a call to check_content_fast, // if the gaussian sum (see below) is not used and you // want to be a bit faster - const auto workDiv = alpaka::WorkDivMembers{Idx{blocks}, Idx{threads}, Idx{1}}; + auto const workDiv = alpaka::WorkDivMembers{Idx{blocks}, Idx{threads}, Idx{1}}; alpaka::enqueue( queue, alpaka::createTaskKernel( @@ -518,7 +520,7 @@ auto verify( alpaka::memcpy(queue, h_correct, d_correct, 1); alpaka::wait(queue); - const auto correct = *alpaka::getPtrNative(h_correct); + auto const correct = *alpaka::getPtrNative(h_correct); dout() << (correct ? "done\n" : "failed\n"); return correct != 0; } @@ -529,22 +531,22 @@ auto verify( * for params, see run_heap_verification-internal parameters */ void print_machine_readable( - const unsigned pagesize, - const unsigned accessblocks, - const unsigned regionsize, - const unsigned wastefactor, - const bool resetfreedpages, - const unsigned blocks, - const unsigned threads, - const unsigned elemsPerSlot, - const size_t allocElemSize, - const size_t heapSize, - const size_t maxSpace, - const size_t maxSlots, - const unsigned long long usedSlots, - const float allocFrac, - const size_t wasted, - const bool correct) + unsigned const pagesize, + unsigned const accessblocks, + unsigned const regionsize, + unsigned const wastefactor, + bool const resetfreedpages, + unsigned const blocks, + unsigned const threads, + unsigned const elemsPerSlot, + size_t const allocElemSize, + size_t const heapSize, + size_t const maxSpace, + size_t const maxSlots, + unsigned long long const usedSlots, + float const allocFrac, + size_t const wasted, + bool const correct) { std::string sep = ","; std::stringstream h; @@ -616,21 +618,21 @@ void print_machine_readable( * @return true if the verification was successful, * false otherwise */ -auto run_heap_verification(const size_t heapMB, const unsigned blocks, unsigned threads, const bool machine_readable) +auto run_heap_verification(size_t const heapMB, unsigned const blocks, unsigned threads, bool const machine_readable) -> bool { auto const platform = alpaka::Platform{}; - const auto dev = alpaka::getDevByIdx(platform, 0); + auto const dev = alpaka::getDevByIdx(platform, 0); auto queue = Queue{dev}; auto const devProps = alpaka::getAccDevProps(dev); threads = std::min(static_cast(threads), static_cast(devProps.m_blockThreadCountMax)); - const size_t heapSize = size_t(1024U * 1024U) * heapMB; - const size_t slotSize = sizeof(allocElem_t) * ELEMS_PER_SLOT; - const size_t nPointers = (heapSize + slotSize - 1) / slotSize; - const size_t maxSlots = heapSize / slotSize; - const size_t maxSpace = maxSlots * slotSize + nPointers * sizeof(allocElem_t*); + size_t const heapSize = size_t(1024U * 1024U) * heapMB; + size_t const slotSize = sizeof(allocElem_t) * ELEMS_PER_SLOT; + size_t const nPointers = (heapSize + slotSize - 1) / slotSize; + size_t const maxSlots = heapSize / slotSize; + size_t const maxSpace = maxSlots * slotSize + nPointers * sizeof(allocElem_t*); bool correct = true; dout() << "CreationPolicy Arguments:\n"; @@ -675,7 +677,7 @@ auto run_heap_verification(const size_t heapMB, const unsigned blocks, unsigned // damaging one cell dout() << "damaging of element... "; { - const auto workDiv = alpaka::WorkDivMembers{Idx{1}, Idx{1}, Idx{1}}; + auto const workDiv = alpaka::WorkDivMembers{Idx{1}, Idx{1}, Idx{1}}; alpaka::enqueue( queue, alpaka::createTaskKernel(workDiv, DamageElement{}, alpaka::getPtrNative(d_testData))); @@ -692,7 +694,7 @@ auto run_heap_verification(const size_t heapMB, const unsigned blocks, unsigned auto d_dealloc_counter = alpaka::allocBuf(dev, Idx{1}); alpaka::memset(queue, d_dealloc_counter, 0, 1); { - const auto workDiv = alpaka::WorkDivMembers{Idx{blocks}, Idx{threads}, Idx{1}}; + auto const workDiv = alpaka::WorkDivMembers{Idx{blocks}, Idx{threads}, Idx{1}}; alpaka::enqueue( queue, alpaka::createTaskKernel( diff --git a/tests/verify_heap_config.hpp b/tests/verify_heap_config.hpp index 4874b161..1ca2b8bb 100644 --- a/tests/verify_heap_config.hpp +++ b/tests/verify_heap_config.hpp @@ -30,6 +30,7 @@ #include #include + #include using Dim = alpaka::DimInt<1>;