From ce65ae054a3c7e71c93b7f368bc1e068adf19aec Mon Sep 17 00:00:00 2001 From: Daniel Seemaier Date: Thu, 25 Apr 2024 16:55:46 +0200 Subject: [PATCH 1/3] feat: add option to use huge pages --- CMakeLists.txt | 8 ++ apps/io/shm_compressed_graph_binary.cc | 2 +- apps/io/shm_io.cc | 8 +- kaminpar-common/datastructures/static_array.h | 82 +++++++++++++------ .../datastructures/compressed_graph.cc | 2 +- kaminpar-shm/graphutils/permutator.cc | 19 +++-- kaminpar-shm/graphutils/permutator.h | 4 +- kaminpar-shm/graphutils/subgraph_extractor.cc | 24 +++--- kaminpar-shm/kaminpar.cc | 16 ++-- kaminpar-shm/label_propagation.h | 5 +- 10 files changed, 110 insertions(+), 60 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c0136f2c..959f7a62 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,6 +33,7 @@ option(KAMINPAR_ENABLE_PAGE_PROFILING "Profile pages allocated via mmap." OFF) option(KAMINPAR_ENABLE_STATISTICS "Generate and output detailed statistics." OFF) option(KAMINPAR_ENABLE_TIMERS "Measure running times. Must be set to 'OFF' if the library interface is used from multiple threads simulatinously." ON) option(KAMINPAR_ENABLE_TIMER_BARRIERS "Add additional MPI_Barrier() instructions for more accurate time measurements." ON) +option(KAMINPAR_ENABLE_HUGE_PAGES "Use huge pages for large allocations, if available." ON) option(KAMINPAR_BUILD_WITH_ASAN "Enable address sanitizer." OFF) option(KAMINPAR_BUILD_WITH_UBSAN "Enable undefined behaviour sanitizer." OFF) @@ -203,6 +204,13 @@ else () message(STATUS "Timer barriers: disabled") endif () +if (KAMINPAR_ENABLE_HUGE_PAGES) + list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_ENABLE_HUGE_PAGES") + message(STATUS "Huge pages: enabled") +else () + message(STATUS "Huge pages: disabled") +endif () + message(STATUS "Graph compression summary:") if (KAMINPAR_COMPRESSION_HIGH_DEGREE_ENCODING) diff --git a/apps/io/shm_compressed_graph_binary.cc b/apps/io/shm_compressed_graph_binary.cc index b1c3ad23..73f75fcb 100644 --- a/apps/io/shm_compressed_graph_binary.cc +++ b/apps/io/shm_compressed_graph_binary.cc @@ -93,7 +93,7 @@ template static StaticArray read_static_array(std::ifstream &in, const std::size_t size) { T *ptr = static_cast(std::malloc(sizeof(T) * size)); in.read(reinterpret_cast(ptr), sizeof(T) * size); - return StaticArray(ptr, size); + return StaticArray(size, ptr); } CompressedGraph read(const std::string &filename) { diff --git a/apps/io/shm_io.cc b/apps/io/shm_io.cc index 36b9f251..c0dc37a2 100644 --- a/apps/io/shm_io.cc +++ b/apps/io/shm_io.cc @@ -158,15 +158,15 @@ template CSRGraph csr_read(const std::string &filename, const boo store_node_weights = format.has_node_weights; store_edge_weights = format.has_edge_weights; - nodes.resize(format.number_of_nodes + 1); - edges.resize(format.number_of_edges * 2); + nodes.resize(format.number_of_nodes + 1, static_array::huge, static_array::noinit); + edges.resize(format.number_of_edges * 2, static_array::huge, static_array::noinit); if (store_node_weights) { - node_weights.resize(format.number_of_nodes); + node_weights.resize(format.number_of_nodes, static_array::huge, static_array::noinit); } if (store_edge_weights) { - edge_weights.resize(format.number_of_edges * 2); + edge_weights.resize(format.number_of_edges * 2, static_array::huge, static_array::noinit); } }, [&](const std::uint64_t weight) { diff --git a/kaminpar-common/datastructures/static_array.h b/kaminpar-common/datastructures/static_array.h index 2cc29a00..a6fb3932 100644 --- a/kaminpar-common/datastructures/static_array.h +++ b/kaminpar-common/datastructures/static_array.h @@ -6,22 +6,34 @@ ******************************************************************************/ #pragma once +#include #include #include #include #include +#include #include #include +#ifdef KAMINPAR_ENABLE_HUGE_PAGES +#include "stdlib.h" +#include "sys/mman.h" +#endif // KAMINPAR_ENABLE_HUGE_PAGES + #include "kaminpar-common/assert.h" #include "kaminpar-common/heap_profiler.h" -#include "kaminpar-common/parallel/tbb_malloc.h" namespace kaminpar { namespace static_array { constexpr struct noinit_t { } noinit; + +constexpr struct huge_t { +} huge; + +constexpr struct seq_t { +} seq; } // namespace static_array template class StaticArray { @@ -128,27 +140,19 @@ template class StaticArray { using iterator = StaticArrayIterator; using const_iterator = const StaticArrayIterator; - StaticArray(T *storage, const std::size_t size) : _size(size), _data(storage) { - RECORD_DATA_STRUCT(size * sizeof(T), _struct); - } - - StaticArray(const std::size_t start, const std::size_t size, StaticArray &data) - : StaticArray(size, data._data + start) { - KASSERT(start + size <= data.size()); - } - StaticArray(const std::size_t size, value_type *data) : _size(size), _data(data) { RECORD_DATA_STRUCT(size * sizeof(T), _struct); } - StaticArray(const std::size_t size, const value_type init_value = value_type()) { + template + StaticArray(const std::size_t size, const value_type init_value, Tags... tags) { RECORD_DATA_STRUCT(0, _struct); - resize(size, init_value); + resize(size, init_value, std::forward(tags)...); } - StaticArray(const std::size_t size, static_array::noinit_t) { + template StaticArray(const std::size_t size, Tags... tags) { RECORD_DATA_STRUCT(0, _struct); - resize(size, static_array::noinit); + resize(size, value_type(), std::forward(tags)...); } template @@ -283,18 +287,26 @@ template class StaticArray { return _size; } - void resize(const std::size_t size, static_array::noinit_t) { - KASSERT(_data == _owned_data.get(), "cannot resize span", assert::always); - allocate_data(size); + template void resize(const std::size_t size, Tags &&...tags) { + resize(size, value_type(), std::forward(tags)...); } - void resize( - const size_type size, - const value_type init_value = value_type(), - const bool assign_parallel = true - ) { - resize(size, static_array::noinit); - assign(size, init_value, assign_parallel); + template + void resize(const std::size_t size, const value_type init_value, Tags &&...tags) { + KASSERT(_data == _owned_data.get(), "cannot resize span", assert::always); + if (size > 0 && (std::is_same_v || ...)) { + allocate_huge_data(size); + } else { + allocate_data(size); + } + + if constexpr (!(std::is_same_v || ...)) { + if constexpr ((std::is_same_v || ...)) { + assign(size, init_value, false); + } else { + assign(size, init_value); + } + } } void assign(const size_type count, const value_type value, const bool assign_parallel = true) { @@ -314,7 +326,7 @@ template class StaticArray { } } - parallel::tbb_unique_ptr free() { + std::unique_ptr free() { _size = 0; _unrestricted_size = 0; _data = nullptr; @@ -322,8 +334,24 @@ template class StaticArray { } private: + void allocate_huge_data(const std::size_t size) { +#ifdef KAMINPAR_ENABLE_HUGE_PAGES + _data = nullptr; + posix_memalign(reinterpret_cast(&_data), 1 << 21, size * sizeof(value_type)); + madvise(_data, size * sizeof(value_type), MADV_HUGEPAGE); + + _owned_data.reset(_data); + _size = size; + _unrestricted_size = _size; + + IF_HEAP_PROFILING(_struct->size = std::max(_struct->size, size * sizeof(value_type))); +#else // KAMINPAR_ENABLE_HUGE_PAGES + allocate_data(size); +#endif // KAMINPAR_ENABLE_HUGE_PAGES + } + void allocate_data(const std::size_t size) { - _owned_data = parallel::make_unique(size); + _owned_data = std::make_unique(size); _data = _owned_data.get(); _size = size; _unrestricted_size = _size; @@ -333,7 +361,7 @@ template class StaticArray { size_type _size = 0; size_type _unrestricted_size = 0; - parallel::tbb_unique_ptr _owned_data = nullptr; + std::unique_ptr _owned_data = nullptr; value_type *_data = nullptr; IF_HEAP_PROFILING(heap_profiler::DataStructure *_struct); diff --git a/kaminpar-shm/datastructures/compressed_graph.cc b/kaminpar-shm/datastructures/compressed_graph.cc index 5091ac46..bd025b97 100644 --- a/kaminpar-shm/datastructures/compressed_graph.cc +++ b/kaminpar-shm/datastructures/compressed_graph.cc @@ -383,7 +383,7 @@ CompressedGraph CompressedGraphBuilder::build() { const std::size_t stored_bytes = static_cast(_cur_compressed_edges - _compressed_edges); RECORD("compressed_edges") - StaticArray compressed_edges(_compressed_edges, stored_bytes); + StaticArray compressed_edges(stored_bytes, _compressed_edges); if constexpr (kHeapProfiling) { heap_profiler::HeapProfiler::global().record_alloc(_compressed_edges, stored_bytes); diff --git a/kaminpar-shm/graphutils/permutator.cc b/kaminpar-shm/graphutils/permutator.cc index 734e4ceb..40d0d0b6 100644 --- a/kaminpar-shm/graphutils/permutator.cc +++ b/kaminpar-shm/graphutils/permutator.cc @@ -28,12 +28,18 @@ NodePermutations rearrange_graph( ) { START_HEAP_PROFILER("Temporal nodes and edges allocation"); START_TIMER("Allocation (noinit)"); - RECORD("tmp_nodes") StaticArray tmp_nodes(nodes.size(), static_array::noinit); - RECORD("tmp_edges") StaticArray tmp_edges(edges.size(), static_array::noinit); + RECORD("tmp_nodes") + StaticArray tmp_nodes(nodes.size(), static_array::huge, static_array::noinit); + RECORD("tmp_edges") + StaticArray tmp_edges(edges.size(), static_array::huge, static_array::noinit); RECORD("tmp_node_weights") - StaticArray tmp_node_weights(node_weights.size(), static_array::noinit); + StaticArray tmp_node_weights( + node_weights.size(), static_array::huge, static_array::noinit + ); RECORD("tmp_edge_weights") - StaticArray tmp_edge_weights(edge_weights.size(), static_array::noinit); + StaticArray tmp_edge_weights( + edge_weights.size(), static_array::huge, static_array::noinit + ); STOP_TIMER(); STOP_HEAP_PROFILER(); @@ -361,7 +367,9 @@ PartitionedGraph assign_isolated_nodes( const NodeID num_nonisolated_nodes = graph.n() - num_isolated_nodes; // The following call graph.n() should include isolated nodes now - RECORD("partition") StaticArray partition(graph.n()); + RECORD("partition") + StaticArray partition(graph.n(), static_array::huge, static_array::noinit); + // copy partition of non-isolated nodes tbb::parallel_for(0, num_nonisolated_nodes, [&](const NodeID u) { partition[u] = p_graph.block(u); @@ -383,5 +391,4 @@ PartitionedGraph assign_isolated_nodes( return {graph, k, std::move(partition)}; } - } // namespace kaminpar::shm::graph diff --git a/kaminpar-shm/graphutils/permutator.h b/kaminpar-shm/graphutils/permutator.h index b641d302..095561cb 100644 --- a/kaminpar-shm/graphutils/permutator.h +++ b/kaminpar-shm/graphutils/permutator.h @@ -45,8 +45,8 @@ NodePermutations sort_by_degree_buckets(const StaticArray & const NodeID n = nodes.size() - 1; const int cpus = std::min(tbb::this_task_arena::max_concurrency(), n); - RECORD("permutation") StaticArray permutation(n); - RECORD("inverse_permutation") StaticArray inverse_permutation(n); + RECORD("permutation") StaticArray permutation(n, static_array::huge); + RECORD("inverse_permutation") StaticArray inverse_permutation(n, static_array::huge); // local_buckets[cpu][bucket]: thread-local bucket sizes using Buckets = std::array + 1>; diff --git a/kaminpar-shm/graphutils/subgraph_extractor.cc b/kaminpar-shm/graphutils/subgraph_extractor.cc index 4daf57bb..a3b05c27 100644 --- a/kaminpar-shm/graphutils/subgraph_extractor.cc +++ b/kaminpar-shm/graphutils/subgraph_extractor.cc @@ -129,17 +129,21 @@ SequentialSubgraphExtractionResult extract_subgraphs_sequential_generic_graph( subgraph_positions[1].edges_start_pos = memory_position.edges_start_pos + m1; auto create_graph = [&](const NodeID n0, const NodeID n, const EdgeID m0, const EdgeID m) { - StaticArray s_nodes(memory_position.nodes_start_pos + n0, n + 1, subgraph_memory.nodes); - StaticArray s_edges(memory_position.edges_start_pos + m0, m, subgraph_memory.edges); + StaticArray s_nodes( + n + 1, subgraph_memory.nodes.data() + memory_position.nodes_start_pos + n0 + ); + StaticArray s_edges( + m, subgraph_memory.edges.data() + memory_position.edges_start_pos + m0 + ); StaticArray s_node_weights( - is_node_weighted * (memory_position.nodes_start_pos + n0), is_node_weighted * n, - subgraph_memory.node_weights + subgraph_memory.node_weights.data() + + is_node_weighted * (memory_position.nodes_start_pos + n0) ); StaticArray s_edge_weights( - is_edge_weighted * (memory_position.edges_start_pos + m0), is_edge_weighted * m, - subgraph_memory.edge_weights + subgraph_memory.edge_weights.data() + + is_edge_weighted * (memory_position.edges_start_pos + m0) ); return shm::Graph(std::make_unique( CSRGraph::seq{}, @@ -291,13 +295,13 @@ SubgraphExtractionResult extract_subgraphs_generic_graph( start_positions[b + 1].nodes_start_pos - n0 - compute_final_k(b, p_graph.k(), input_k); const EdgeID m = start_positions[b + 1].edges_start_pos - m0; - StaticArray nodes(n0, n + 1, subgraph_memory.nodes); - StaticArray edges(m0, m, subgraph_memory.edges); + StaticArray nodes(n + 1, subgraph_memory.nodes.data() + n0); + StaticArray edges(m, subgraph_memory.edges.data() + m0); StaticArray node_weights( - is_node_weighted * n0, is_node_weighted * n, subgraph_memory.node_weights + is_node_weighted * n, subgraph_memory.node_weights.data() + is_node_weighted * n0 ); StaticArray edge_weights( - is_edge_weighted * m0, is_edge_weighted * m, subgraph_memory.edge_weights + is_edge_weighted * m, subgraph_memory.edge_weights.data() + is_edge_weighted * m0 ); subgraphs[b] = shm::Graph(std::make_unique( std::move(nodes), std::move(edges), std::move(node_weights), std::move(edge_weights) diff --git a/kaminpar-shm/kaminpar.cc b/kaminpar-shm/kaminpar.cc index 194692d0..5deb6e09 100644 --- a/kaminpar-shm/kaminpar.cc +++ b/kaminpar-shm/kaminpar.cc @@ -109,14 +109,14 @@ void KaMinPar::borrow_and_mutate_graph( const EdgeID m = xadj[n]; - RECORD("nodes") StaticArray nodes(xadj, n + 1); - RECORD("edges") StaticArray edges(adjncy, m); + RECORD("nodes") StaticArray nodes(n + 1, xadj); + RECORD("edges") StaticArray edges(m, adjncy); RECORD("node_weights") StaticArray node_weights = - (vwgt == nullptr) ? StaticArray(0) : StaticArray(vwgt, n); + (vwgt == nullptr) ? StaticArray(0) : StaticArray(n, vwgt); RECORD("edge_weights") StaticArray edge_weights = - (adjwgt == nullptr) ? StaticArray(0) : StaticArray(adjwgt, m); + (adjwgt == nullptr) ? StaticArray(0) : StaticArray(m, adjwgt); _was_rearranged = false; _graph_ptr = std::make_unique(std::make_unique( @@ -134,10 +134,10 @@ void KaMinPar::copy_graph( const bool has_node_weights = vwgt != nullptr; const bool has_edge_weights = adjwgt != nullptr; - RECORD("nodes") StaticArray nodes(n + 1); - RECORD("edges") StaticArray edges(m); - RECORD("node_weights") StaticArray node_weights(has_node_weights ? n : 0); - RECORD("edge_weights") StaticArray edge_weights(has_edge_weights ? m : 0); + RECORD("nodes") StaticArray nodes(n + 1, static_array::huge); + RECORD("edges") StaticArray edges(m, static_array::huge); + RECORD("node_weights") StaticArray node_weights(has_node_weights ? n : 0, static_array::huge); + RECORD("edge_weights") StaticArray edge_weights(has_edge_weights ? m : 0, static_array::huge); nodes[n] = xadj[n]; tbb::parallel_for(0, n, [&](const NodeID u) { diff --git a/kaminpar-shm/label_propagation.h b/kaminpar-shm/label_propagation.h index c5298024..9fb78ac2 100644 --- a/kaminpar-shm/label_propagation.h +++ b/kaminpar-shm/label_propagation.h @@ -16,6 +16,8 @@ #include #include +#include "kaminpar-shm/kaminpar.h" + #include "kaminpar-common/assert.h" #include "kaminpar-common/datastructures/concurrent_fast_reset_array.h" #include "kaminpar-common/datastructures/concurrent_two_level_vector.h" @@ -23,6 +25,7 @@ #include "kaminpar-common/datastructures/rating_map.h" #include "kaminpar-common/heap_profiler.h" #include "kaminpar-common/logger.h" +#include "kaminpar-common/parallel/algorithm.h" #include "kaminpar-common/parallel/atomic.h" #include "kaminpar-common/random.h" #include "kaminpar-common/tags.h" @@ -281,7 +284,7 @@ template class LabelPropagat } // Compute a mapping from old cluster IDs to new cluster IDs. - RECORD("mapping") StaticArray mapping(_graph->n()); + RECORD("mapping") StaticArray mapping(_graph->n(), static_array::huge); tbb::parallel_for(tbb::blocked_range(0, _graph->n()), [&](const auto &r) { for (NodeID u = r.begin(); u != r.end(); ++u) { const ClusterID c_u = derived_cluster(u); From 8492da0adc022d31baaa45bb8cb4cf7426800e30 Mon Sep 17 00:00:00 2001 From: Daniel Seemaier Date: Thu, 25 Apr 2024 17:30:52 +0200 Subject: [PATCH 2/3] refactor: use tbb functions --- .../datastructures/fast_reset_array.h | 5 +-- kaminpar-common/datastructures/static_array.h | 40 +++++-------------- kaminpar-common/parallel/tbb_malloc.h | 20 +++++++++- kaminpar-shm/graphutils/permutator.cc | 13 +++--- 4 files changed, 37 insertions(+), 41 deletions(-) diff --git a/kaminpar-common/datastructures/fast_reset_array.h b/kaminpar-common/datastructures/fast_reset_array.h index 87142e8e..ad2ef452 100644 --- a/kaminpar-common/datastructures/fast_reset_array.h +++ b/kaminpar-common/datastructures/fast_reset_array.h @@ -8,7 +8,6 @@ #pragma once #include -#include #include "kaminpar-common/assert.h" #include "kaminpar-common/datastructures/scalable_vector.h" @@ -63,7 +62,7 @@ template class FastResetArray { return _data[pos] != Value(); } - [[nodiscard]] std::vector &used_entry_ids() { + [[nodiscard]] scalable_vector &used_entry_ids() { return _used_entries; } @@ -118,7 +117,7 @@ template class FastResetArray { private: scalable_vector _data; - std::vector _used_entries{}; + scalable_vector _used_entries{}; IF_HEAP_PROFILING(heap_profiler::DataStructure *_struct); }; diff --git a/kaminpar-common/datastructures/static_array.h b/kaminpar-common/datastructures/static_array.h index a6fb3932..21110281 100644 --- a/kaminpar-common/datastructures/static_array.h +++ b/kaminpar-common/datastructures/static_array.h @@ -16,13 +16,9 @@ #include -#ifdef KAMINPAR_ENABLE_HUGE_PAGES -#include "stdlib.h" -#include "sys/mman.h" -#endif // KAMINPAR_ENABLE_HUGE_PAGES - #include "kaminpar-common/assert.h" #include "kaminpar-common/heap_profiler.h" +#include "kaminpar-common/parallel/tbb_malloc.h" namespace kaminpar { namespace static_array { @@ -295,16 +291,16 @@ template class StaticArray { void resize(const std::size_t size, const value_type init_value, Tags &&...tags) { KASSERT(_data == _owned_data.get(), "cannot resize span", assert::always); if (size > 0 && (std::is_same_v || ...)) { - allocate_huge_data(size); + allocate_data(size, true); } else { - allocate_data(size); + allocate_data(size, false); } if constexpr (!(std::is_same_v || ...)) { if constexpr ((std::is_same_v || ...)) { assign(size, init_value, false); } else { - assign(size, init_value); + assign(size, init_value, true); } } } @@ -313,8 +309,8 @@ template class StaticArray { KASSERT(_data); if (assign_parallel) { - const std::size_t step{std::max(count / std::thread::hardware_concurrency(), 1UL)}; - tbb::parallel_for(0UL, count, step, [&](const size_type i) { + const std::size_t step = std::max(count / std::thread::hardware_concurrency(), 1UL); + tbb::parallel_for(0, count, step, [&](const size_type i) { for (size_type j = i; j < std::min(i + step, count); ++j) { _data[j] = value; } @@ -326,7 +322,7 @@ template class StaticArray { } } - std::unique_ptr free() { + parallel::tbb_unique_ptr free() { _size = 0; _unrestricted_size = 0; _data = nullptr; @@ -334,24 +330,8 @@ template class StaticArray { } private: - void allocate_huge_data(const std::size_t size) { -#ifdef KAMINPAR_ENABLE_HUGE_PAGES - _data = nullptr; - posix_memalign(reinterpret_cast(&_data), 1 << 21, size * sizeof(value_type)); - madvise(_data, size * sizeof(value_type), MADV_HUGEPAGE); - - _owned_data.reset(_data); - _size = size; - _unrestricted_size = _size; - - IF_HEAP_PROFILING(_struct->size = std::max(_struct->size, size * sizeof(value_type))); -#else // KAMINPAR_ENABLE_HUGE_PAGES - allocate_data(size); -#endif // KAMINPAR_ENABLE_HUGE_PAGES - } - - void allocate_data(const std::size_t size) { - _owned_data = std::make_unique(size); + void allocate_data(const std::size_t size, const bool huge = false) { + _owned_data = parallel::make_unique(size, huge); _data = _owned_data.get(); _size = size; _unrestricted_size = _size; @@ -361,7 +341,7 @@ template class StaticArray { size_type _size = 0; size_type _unrestricted_size = 0; - std::unique_ptr _owned_data = nullptr; + parallel::tbb_unique_ptr _owned_data = nullptr; value_type *_data = nullptr; IF_HEAP_PROFILING(heap_profiler::DataStructure *_struct); diff --git a/kaminpar-common/parallel/tbb_malloc.h b/kaminpar-common/parallel/tbb_malloc.h index 421b6052..dbd6ba08 100644 --- a/kaminpar-common/parallel/tbb_malloc.h +++ b/kaminpar-common/parallel/tbb_malloc.h @@ -13,6 +13,10 @@ #include "kaminpar-common/assert.h" #include "kaminpar-common/heap_profiler.h" +#ifdef KAMINPAR_ENABLE_HUGE_PAGES +#include "sys/mman.h" +#endif // KAMINPAR_ENABLE_HUGE_PAGES + namespace kaminpar::parallel { template struct tbb_deleter { void operator()(T *p) { @@ -27,9 +31,21 @@ template struct tbb_deleter { template using tbb_unique_ptr = std::unique_ptr>; // template using tbb_unique_ptr = std::unique_ptr; -template tbb_unique_ptr make_unique(const std::size_t size) { +template +tbb_unique_ptr make_unique(const std::size_t size, const bool huge = false) { auto nbytes = sizeof(T) * size; - T *ptr = static_cast(scalable_malloc(nbytes)); + T *ptr = nullptr; + +#ifdef KAMINPAR_ENABLE_HUGE_PAGES + if (huge) { + scalable_posix_memalign(reinterpret_cast(&ptr), 1 << 21, nbytes); + madvise(ptr, nbytes, MADV_HUGEPAGE); + } else { +#endif + ptr = static_cast(scalable_malloc(nbytes)); +#ifdef KAMINPAR_ENABLE_HUGE_PAGES + } +#endif KASSERT( ptr != nullptr, "out of memory: could not allocate " << nbytes << " bytes", assert::light diff --git a/kaminpar-shm/graphutils/permutator.cc b/kaminpar-shm/graphutils/permutator.cc index 40d0d0b6..fb3ccf23 100644 --- a/kaminpar-shm/graphutils/permutator.cc +++ b/kaminpar-shm/graphutils/permutator.cc @@ -10,6 +10,7 @@ #include #include +#include #include "kaminpar-common/assert.h" #include "kaminpar-common/heap_profiler.h" @@ -27,7 +28,6 @@ NodePermutations rearrange_graph( StaticArray &edge_weights ) { START_HEAP_PROFILER("Temporal nodes and edges allocation"); - START_TIMER("Allocation (noinit)"); RECORD("tmp_nodes") StaticArray tmp_nodes(nodes.size(), static_array::huge, static_array::noinit); RECORD("tmp_edges") @@ -40,7 +40,6 @@ NodePermutations rearrange_graph( StaticArray tmp_edge_weights( edge_weights.size(), static_array::huge, static_array::noinit ); - STOP_TIMER(); STOP_HEAP_PROFILER(); // if we are about to remove all isolated nodes, we place them to the end of @@ -70,10 +69,12 @@ NodePermutations rearrange_graph( STOP_HEAP_PROFILER(); START_TIMER("Deallocation"); - tmp_nodes.free(); - tmp_edges.free(); - tmp_node_weights.free(); - tmp_edge_weights.free(); + tbb::parallel_invoke( + [&] { tmp_nodes.free(); }, + [&] { tmp_edges.free(); }, + [&] { tmp_node_weights.free(); }, + [&] { tmp_edge_weights.free(); } + ); STOP_TIMER(); return permutations; From 6e9516d454395582b7003202f136f45839c3632d Mon Sep 17 00:00:00 2001 From: Daniel Seemaier Date: Mon, 29 Apr 2024 11:19:28 +0200 Subject: [PATCH 3/3] feat: make thp the default --- CMakeLists.txt | 7 +++--- apps/io/shm_io.cc | 8 +++---- kaminpar-common/datastructures/static_array.h | 23 +++++++++++-------- kaminpar-common/parallel/tbb_malloc.h | 17 +++++++------- kaminpar-shm/graphutils/permutator.cc | 14 ++++------- kaminpar-shm/graphutils/permutator.h | 4 ++-- kaminpar-shm/kaminpar.cc | 8 +++---- kaminpar-shm/label_propagation.h | 2 +- 8 files changed, 42 insertions(+), 41 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 959f7a62..69ad98e8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,7 +33,8 @@ option(KAMINPAR_ENABLE_PAGE_PROFILING "Profile pages allocated via mmap." OFF) option(KAMINPAR_ENABLE_STATISTICS "Generate and output detailed statistics." OFF) option(KAMINPAR_ENABLE_TIMERS "Measure running times. Must be set to 'OFF' if the library interface is used from multiple threads simulatinously." ON) option(KAMINPAR_ENABLE_TIMER_BARRIERS "Add additional MPI_Barrier() instructions for more accurate time measurements." ON) -option(KAMINPAR_ENABLE_HUGE_PAGES "Use huge pages for large allocations, if available." ON) + +option(KAMINPAR_ENABLE_THP "Use transparent huge pages for large memory allocations." ON) option(KAMINPAR_BUILD_WITH_ASAN "Enable address sanitizer." OFF) option(KAMINPAR_BUILD_WITH_UBSAN "Enable undefined behaviour sanitizer." OFF) @@ -204,8 +205,8 @@ else () message(STATUS "Timer barriers: disabled") endif () -if (KAMINPAR_ENABLE_HUGE_PAGES) - list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_ENABLE_HUGE_PAGES") +if (KAMINPAR_ENABLE_THP) + list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_ENABLE_THP") message(STATUS "Huge pages: enabled") else () message(STATUS "Huge pages: disabled") diff --git a/apps/io/shm_io.cc b/apps/io/shm_io.cc index c0dc37a2..7cbd2067 100644 --- a/apps/io/shm_io.cc +++ b/apps/io/shm_io.cc @@ -158,15 +158,15 @@ template CSRGraph csr_read(const std::string &filename, const boo store_node_weights = format.has_node_weights; store_edge_weights = format.has_edge_weights; - nodes.resize(format.number_of_nodes + 1, static_array::huge, static_array::noinit); - edges.resize(format.number_of_edges * 2, static_array::huge, static_array::noinit); + nodes.resize(format.number_of_nodes + 1, static_array::noinit); + edges.resize(format.number_of_edges * 2, static_array::noinit); if (store_node_weights) { - node_weights.resize(format.number_of_nodes, static_array::huge, static_array::noinit); + node_weights.resize(format.number_of_nodes, static_array::noinit); } if (store_edge_weights) { - edge_weights.resize(format.number_of_edges * 2, static_array::huge, static_array::noinit); + edge_weights.resize(format.number_of_edges * 2, static_array::noinit); } }, [&](const std::uint64_t weight) { diff --git a/kaminpar-common/datastructures/static_array.h b/kaminpar-common/datastructures/static_array.h index 21110281..70dce10e 100644 --- a/kaminpar-common/datastructures/static_array.h +++ b/kaminpar-common/datastructures/static_array.h @@ -20,14 +20,21 @@ #include "kaminpar-common/heap_profiler.h" #include "kaminpar-common/parallel/tbb_malloc.h" +#define KAMINPAR_THP_THRESHOLD 1024 * 1024 * 64 + namespace kaminpar { namespace static_array { +//! Tag for allocating memory, but not touching it. Without this tag, memory is initialized to the +//! default value for the given type. constexpr struct noinit_t { } noinit; -constexpr struct huge_t { -} huge; +//! Tag for small memory allocations that should never be backed by a transparent huge page. +constexpr struct small_t { +} small; +//! Tag for initializing memory sequentially. Without this tag, memory will be initialized by a +//! parallel loop. Has no effect if noinit is also passed. constexpr struct seq_t { } seq; } // namespace static_array @@ -290,11 +297,9 @@ template class StaticArray { template void resize(const std::size_t size, const value_type init_value, Tags &&...tags) { KASSERT(_data == _owned_data.get(), "cannot resize span", assert::always); - if (size > 0 && (std::is_same_v || ...)) { - allocate_data(size, true); - } else { - allocate_data(size, false); - } + const bool use_thp = + (size >= KAMINPAR_THP_THRESHOLD && !(std::is_same_v || ...)); + allocate_data(size, use_thp); if constexpr (!(std::is_same_v || ...)) { if constexpr ((std::is_same_v || ...)) { @@ -330,8 +335,8 @@ template class StaticArray { } private: - void allocate_data(const std::size_t size, const bool huge = false) { - _owned_data = parallel::make_unique(size, huge); + void allocate_data(const std::size_t size, const bool thp) { + _owned_data = parallel::make_unique(size, thp); _data = _owned_data.get(); _size = size; _unrestricted_size = _size; diff --git a/kaminpar-common/parallel/tbb_malloc.h b/kaminpar-common/parallel/tbb_malloc.h index dbd6ba08..303928a4 100644 --- a/kaminpar-common/parallel/tbb_malloc.h +++ b/kaminpar-common/parallel/tbb_malloc.h @@ -13,9 +13,9 @@ #include "kaminpar-common/assert.h" #include "kaminpar-common/heap_profiler.h" -#ifdef KAMINPAR_ENABLE_HUGE_PAGES +#ifdef KAMINPAR_ENABLE_THP #include "sys/mman.h" -#endif // KAMINPAR_ENABLE_HUGE_PAGES +#endif // KAMINPAR_ENABLE_THP namespace kaminpar::parallel { template struct tbb_deleter { @@ -31,21 +31,20 @@ template struct tbb_deleter { template using tbb_unique_ptr = std::unique_ptr>; // template using tbb_unique_ptr = std::unique_ptr; -template -tbb_unique_ptr make_unique(const std::size_t size, const bool huge = false) { +template tbb_unique_ptr make_unique(const std::size_t size, const bool thp) { auto nbytes = sizeof(T) * size; T *ptr = nullptr; -#ifdef KAMINPAR_ENABLE_HUGE_PAGES - if (huge) { +#ifdef KAMINPAR_ENABLE_THP + if (thp) { scalable_posix_memalign(reinterpret_cast(&ptr), 1 << 21, nbytes); madvise(ptr, nbytes, MADV_HUGEPAGE); } else { -#endif +#endif // KAMINPAR_ENABLE_THP ptr = static_cast(scalable_malloc(nbytes)); -#ifdef KAMINPAR_ENABLE_HUGE_PAGES +#ifdef KAMINPAR_ENABLE_THP } -#endif +#endif // KAMINPAR_ENABLE_THP KASSERT( ptr != nullptr, "out of memory: could not allocate " << nbytes << " bytes", assert::light diff --git a/kaminpar-shm/graphutils/permutator.cc b/kaminpar-shm/graphutils/permutator.cc index fb3ccf23..faf14e44 100644 --- a/kaminpar-shm/graphutils/permutator.cc +++ b/kaminpar-shm/graphutils/permutator.cc @@ -29,17 +29,13 @@ NodePermutations rearrange_graph( ) { START_HEAP_PROFILER("Temporal nodes and edges allocation"); RECORD("tmp_nodes") - StaticArray tmp_nodes(nodes.size(), static_array::huge, static_array::noinit); + StaticArray tmp_nodes(nodes.size(), static_array::noinit); RECORD("tmp_edges") - StaticArray tmp_edges(edges.size(), static_array::huge, static_array::noinit); + StaticArray tmp_edges(edges.size(), static_array::noinit); RECORD("tmp_node_weights") - StaticArray tmp_node_weights( - node_weights.size(), static_array::huge, static_array::noinit - ); + StaticArray tmp_node_weights(node_weights.size(), static_array::noinit); RECORD("tmp_edge_weights") - StaticArray tmp_edge_weights( - edge_weights.size(), static_array::huge, static_array::noinit - ); + StaticArray tmp_edge_weights(edge_weights.size(), static_array::noinit); STOP_HEAP_PROFILER(); // if we are about to remove all isolated nodes, we place them to the end of @@ -369,7 +365,7 @@ PartitionedGraph assign_isolated_nodes( // The following call graph.n() should include isolated nodes now RECORD("partition") - StaticArray partition(graph.n(), static_array::huge, static_array::noinit); + StaticArray partition(graph.n(), static_array::noinit); // copy partition of non-isolated nodes tbb::parallel_for(0, num_nonisolated_nodes, [&](const NodeID u) { diff --git a/kaminpar-shm/graphutils/permutator.h b/kaminpar-shm/graphutils/permutator.h index 095561cb..b641d302 100644 --- a/kaminpar-shm/graphutils/permutator.h +++ b/kaminpar-shm/graphutils/permutator.h @@ -45,8 +45,8 @@ NodePermutations sort_by_degree_buckets(const StaticArray & const NodeID n = nodes.size() - 1; const int cpus = std::min(tbb::this_task_arena::max_concurrency(), n); - RECORD("permutation") StaticArray permutation(n, static_array::huge); - RECORD("inverse_permutation") StaticArray inverse_permutation(n, static_array::huge); + RECORD("permutation") StaticArray permutation(n); + RECORD("inverse_permutation") StaticArray inverse_permutation(n); // local_buckets[cpu][bucket]: thread-local bucket sizes using Buckets = std::array + 1>; diff --git a/kaminpar-shm/kaminpar.cc b/kaminpar-shm/kaminpar.cc index 5deb6e09..05a0b2fb 100644 --- a/kaminpar-shm/kaminpar.cc +++ b/kaminpar-shm/kaminpar.cc @@ -134,10 +134,10 @@ void KaMinPar::copy_graph( const bool has_node_weights = vwgt != nullptr; const bool has_edge_weights = adjwgt != nullptr; - RECORD("nodes") StaticArray nodes(n + 1, static_array::huge); - RECORD("edges") StaticArray edges(m, static_array::huge); - RECORD("node_weights") StaticArray node_weights(has_node_weights ? n : 0, static_array::huge); - RECORD("edge_weights") StaticArray edge_weights(has_edge_weights ? m : 0, static_array::huge); + RECORD("nodes") StaticArray nodes(n + 1); + RECORD("edges") StaticArray edges(m); + RECORD("node_weights") StaticArray node_weights(has_node_weights ? n : 0); + RECORD("edge_weights") StaticArray edge_weights(has_edge_weights ? m : 0); nodes[n] = xadj[n]; tbb::parallel_for(0, n, [&](const NodeID u) { diff --git a/kaminpar-shm/label_propagation.h b/kaminpar-shm/label_propagation.h index 9fb78ac2..3cc960ae 100644 --- a/kaminpar-shm/label_propagation.h +++ b/kaminpar-shm/label_propagation.h @@ -284,7 +284,7 @@ template class LabelPropagat } // Compute a mapping from old cluster IDs to new cluster IDs. - RECORD("mapping") StaticArray mapping(_graph->n(), static_array::huge); + RECORD("mapping") StaticArray mapping(_graph->n()); tbb::parallel_for(tbb::blocked_range(0, _graph->n()), [&](const auto &r) { for (NodeID u = r.begin(); u != r.end(); ++u) { const ClusterID c_u = derived_cluster(u);