diff --git a/.clangd b/.clangd index c7a47fe9..359a391e 100644 --- a/.clangd +++ b/.clangd @@ -1,2 +1,2 @@ CompileFlags: - Add: [-std=c++17] + Add: [-std=c++20] diff --git a/.gitignore b/.gitignore index b3bea2b5..e54d8ae0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ compile_commands.json +layout.kdl .idea +.vscode *~ cmake-build-* build*/ diff --git a/CMakeLists.txt b/CMakeLists.txt index dae02015..c0136f2c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ project(KaMinPar set(PROJECT_VENDOR "Daniel Seemaier") set(PROJECT_CONTACT "daniel.seemaier@kit.edu") -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) ################################################################################ ## Options ## @@ -22,12 +22,15 @@ option(KAMINPAR_BUILD_TESTS "Build unit tests" ON) option(KAMINPAR_BUILD_DISTRIBUTED "Build distributed partitioner." OFF) option(KAMINPAR_BUILD_APPS "Build binaries." ON) option(KAMINPAR_BUILD_BENCHMARKS "Build benchmark binaries." OFF) +option(KAMINPAR_BUILD_TOOLS "Build tool binaries." OFF) option(KAMINPAR_BUILD_EXPERIMENTAL_FEATURES "Include experimental features in the build. This might increase compile times drastically." OFF) # Control how to build ###################### -option(KAMINPAR_ENABLE_STATISTICS "Generate and output detailed statistics." ON) +option(KAMINPAR_ENABLE_HEAP_PROFILING "Profile and output heap memory usage." OFF) +option(KAMINPAR_ENABLE_PAGE_PROFILING "Profile pages allocated via mmap." OFF) +option(KAMINPAR_ENABLE_STATISTICS "Generate and output detailed statistics." OFF) option(KAMINPAR_ENABLE_TIMERS "Measure running times. Must be set to 'OFF' if the library interface is used from multiple threads simulatinously." ON) option(KAMINPAR_ENABLE_TIMER_BARRIERS "Add additional MPI_Barrier() instructions for more accurate time measurements." ON) @@ -37,8 +40,22 @@ option(KAMINPAR_BUILD_WITH_MTUNE_NATIVE "Build with -mtune=native." ON) option(KAMINPAR_BUILD_WITH_CCACHE "Use ccache to build." ON) option(KAMINPAR_BUILD_WITH_DEBUG_SYMBOLS "Always build with debug symbols, even in Release mode." ON) option(KAMINPAR_BUILD_WITH_MTKAHYPAR "If Mt-KaHyPar can be found, build the Mt-KaHyPar initial partitioner." OFF) +option(KAMINPAR_BUILD_WITH_GROWT "Build the shared-memory partitioner with Growt." ON) option(KAMINPAR_BUILD_WITH_PG "Build with the -pg option for profiling." OFF) +# Control graph compression options +################################### +option(KAMINPAR_COMPRESSION_HIGH_DEGREE_ENCODING "Use high-degree encoding for the compressed graph." ON) +option(KAMINPAR_COMPRESSION_INTERVAL_ENCODING "Use interval encoding for the compressed graph." ON) +option(KAMINPAR_COMPRESSION_RUN_LENGTH_ENCODING "Use run-length encoding for the compressed graph." OFF) +option(KAMINPAR_COMPRESSION_STREAM_ENCODING "Use stream encoding for the compressed graph." OFF) +option(KAMINPAR_COMPRESSION_FAST_DECODING "Use fast decoding for the compressed graph." 
OFF) +option(KAMINPAR_COMPRESSION_ISOLATED_NODES_SEPARATION "Whether all isolated nodes are the last nodes of the input graph" OFF) + +if (KAMINPAR_COMPRESSION_RUN_LENGTH_ENCODING AND KAMINPAR_COMPRESSION_STREAM_ENCODING) + message(FATAL_ERROR "Either run-length or stream encoding can be used for varints but not both.") +endif () + # Control data type sizes ######################### @@ -109,7 +126,26 @@ if (KAMINPAR_BUILD_WITH_DEBUG_SYMBOLS) add_compile_options(-g -g3) endif () -# Set compile flags +# Set compile flags +add_compile_options(-msse4.1) + +check_cxx_compiler_flag(-mcx16 COMPILER_SUPPORTS_MCX16) +if (COMPILER_SUPPORTS_MCX16) + add_compile_options(-mcx16) +else () + message(WARNING "-mcx16 flag not supported by the compiler") + + if (KAMINPAR_BUILD_WITH_GROWT) + message(WARNING "-mcx16 flag not supported by the compiler: cannot use growt for the shared-memory partitioner") + set(KAMINPAR_BUILD_WITH_GROWT OFF) + endif () + + if (KAMINPAR_BUILD_DISTRIBUTED) + message(WARNING "-mcx16 flag not supported by the compiler: cannot build the distributed partitioner") + set(KAMINPAR_BUILD_DISTRIBUTED OFF) + endif () +endif () + if (KAMINPAR_BUILD_WITH_MTUNE_NATIVE) add_compile_options(-mtune=native -march=native) endif () @@ -133,7 +169,24 @@ if (KAMINPAR_ENABLE_STATISTICS) list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_ENABLE_STATISTICS") message(STATUS "Statistics: enabled") else () - message(STATIS "Statistics: disabled") + message(STATUS "Statistics: disabled") +endif () + +if (KAMINPAR_ENABLE_HEAP_PROFILING) + string(LENGTH "${CMAKE_SOURCE_DIR}/" SOURCE_PATH_SIZE) + + list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_ENABLE_HEAP_PROFILING") + list(APPEND KAMINPAR_DEFINITIONS "-DSOURCE_PATH_SIZE=${SOURCE_PATH_SIZE}") + message(STATUS "Heap Profiling: enabled") +else () + message(STATUS "Heap Profiling: disabled") +endif () + +if (KAMINPAR_ENABLE_PAGE_PROFILING) + list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_ENABLE_PAGE_PROFILING") + message(STATUS "Page Profiling: enabled") +else () + message(STATUS "Page Profiling: disabled") endif () if (KAMINPAR_ENABLE_TIMERS) @@ -150,6 +203,51 @@ else () message(STATUS "Timer barriers: disabled") endif () +message(STATUS "Graph compression summary:") + +if (KAMINPAR_COMPRESSION_HIGH_DEGREE_ENCODING) + list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_COMPRESSION_HIGH_DEGREE_ENCODING") + message(" High-degree encoding: enabled") +else () + message(" High-degree encoding: disabled") +endif () + +if (KAMINPAR_COMPRESSION_INTERVAL_ENCODING) + list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_COMPRESSION_INTERVAL_ENCODING") + message(" Interval encoding: enabled") +else () + message(" Interval encoding: disabled") +endif () + +if (KAMINPAR_COMPRESSION_RUN_LENGTH_ENCODING) + list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_COMPRESSION_RUN_LENGTH_ENCODING") + message(" Run-length encoding: enabled") +else () + message(" Run-length encoding: disabled") +endif () + +if (KAMINPAR_COMPRESSION_STREAM_ENCODING) + list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_COMPRESSION_STREAM_ENCODING") + message(" Stream encoding: enabled") +else () + message(" Stream encoding: disabled") +endif () + +if (KAMINPAR_COMPRESSION_FAST_DECODING) + list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_COMPRESSION_FAST_DECODING") + add_compile_options(-mbmi2) + message(" Fast decoding: enabled") +else () + message(" Fast decoding: disabled") +endif () + +if (KAMINPAR_COMPRESSION_ISOLATED_NODES_SEPARATION) + list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_COMPRESSION_ISOLATED_NODES_SEPARATION") + message(" Isolated 
nodes separation: enabled") +else () + message(" Isolated nodes separation: disabled") +endif () + if (KAMINPAR_64BIT_NODE_IDS OR KAMINPAR_64BIT_IDS) list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_64BIT_NODE_IDS") set(KAMINPAR_SHM_NODE_ID_STR "std::uint64_t") @@ -203,6 +301,14 @@ if (KAMINPAR_BUILD_WITH_CCACHE) endif () endif () +if (KAMINPAR_BUILD_WITH_GROWT) + list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_USES_GROWT") + + add_subdirectory(external/growt EXCLUDE_FROM_ALL) + add_library(growt INTERFACE) + target_include_directories(growt SYSTEM INTERFACE "external/growt") +endif () + if (KAMINPAR_BUILD_DISTRIBUTED) # MPI set(MPI_DETERMINE_LIBRARY_VERSION TRUE) @@ -212,19 +318,6 @@ if (KAMINPAR_BUILD_DISTRIBUTED) set(KAMINPAR_BUILD_DISTRIBUTED OFF) endif () - # Growt (needs -mcx16, i.e., does not work on ARM) - check_cxx_compiler_flag(-mcx16 COMPILER_SUPPORTS_MCX16) - if (COMPILER_SUPPORTS_MCX16) - add_compile_options(-mcx16) - else () - message(WARNING "-mcx16 flag not supported by the compiler: cannot build the distributed partitioner") - set(KAMINPAR_BUILD_DISTRIBUTED OFF) - endif() - - add_subdirectory(external/growt EXCLUDE_FROM_ALL) - add_library(growt INTERFACE) - target_include_directories(growt SYSTEM INTERFACE "external/growt") - # Google Sparsehash find_package(Sparsehash REQUIRED) endif () @@ -290,7 +383,7 @@ endif () # Unit tests if (KAMINPAR_BUILD_TESTS) - add_subdirectory(external/googletest EXCLUDE_FROM_ALL) + add_subdirectory(external/googletest EXCLUDE_FROM_ALL SYSTEM) enable_testing() add_subdirectory(tests) diff --git a/CMakePresets.json b/CMakePresets.json new file mode 100644 index 00000000..c1ccc7ce --- /dev/null +++ b/CMakePresets.json @@ -0,0 +1,60 @@ +{ + "version": 6, + "cmakeMinimumRequired": { + "major": 3, + "minor": 21, + "patch": 0 + }, + "configurePresets": [ + { + "name": "default", + "displayName": "Default Config", + "cacheVariables": { + "KAMINPAR_64BIT_IDS": "OFF", + "KAMINPAR_64BIT_EDGE_IDS": "OFF", + "KAMINPAR_64BIT_NODE_IDS": "OFF", + "KAMINPAR_64BIT_WEIGHTS": "OFF" + } + }, + { + "name": "distributed", + "displayName": "Default Config for dKaMinPar", + "cacheVariables": { + "KAMINPAR_BUILD_DISTRIBUTED": "ON", + "KAMINPAR_64BIT_IDS": "OFF", + "KAMINPAR_64BIT_EDGE_IDS": "OFF", + "KAMINPAR_64BIT_NODE_IDS": "OFF", + "KAMINPAR_64BIT_WEIGHTS": "ON" + } + }, + { + "name": "compressed", + "displayName": "Default Config for KaMinPar with Memory Optimizations", + "cacheVariables": { + "KAMINPAR_64BIT_EDGE_IDS": "ON", + "KAMINPAR_64BIT_WEIGHTS": "ON" + } + }, + { + "name": "stats", + "displayName": "Default Config for KaMinPar with Statistics", + "cacheVariables": { + "KAMINPAR_ENABLE_STATISTICS": "ON", + "KAMINPAR_ENABLE_HEAP_PROFILING": "ON" + } + }, + + { + "name": "default-stats", + "inherits": ["default", "stats"] + }, + { + "name": "compressed-stats", + "inherits": ["compressed", "stats"] + }, + { + "name": "distributed-stats", + "inherits": ["distributed", "stats"] + } + ] +} diff --git a/README.MD b/README.MD index c47f890e..32c39592 100644 --- a/README.MD +++ b/README.MD @@ -21,7 +21,7 @@ Moreover, for large values of k, it is an order of magnitude faster than competi Build KaMinPar following the standard CMake steps: ```shell -cmake -B build -DCMAKE_BUILD_TYPE=Release -DKAMINPAR_BUILD_DISTRIBUTED=On +cmake -B build -DCMAKE_BUILD_TYPE=Release --preset= cmake --build build --parallel ``` @@ -43,7 +43,7 @@ Presets can be viewed by using the `--dump-config` flag; to use a custom preset, ```shell # Write the default preset to a file -./KaMinPar [-P 
default|strong|largek] --dump-config > my_preset.ini +./KaMinPar -P default --dump-config > my_preset.ini # ... modify the configuration by editing my_preset.ini ... diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt index df88f15d..c219b590 100644 --- a/apps/CMakeLists.txt +++ b/apps/CMakeLists.txt @@ -1,7 +1,23 @@ +set(KAMINPAR_IO_SOURCE_FILES + io/parhip_parser.h + io/parhip_parser.cc + io/shm_compressed_graph_binary.h + io/shm_compressed_graph_binary.cc + io/shm_input_validator.h + io/shm_input_validator.cc + io/shm_io.h + io/shm_io.cc) + +add_library(kaminpar_io ${KAMINPAR_IO_SOURCE_FILES}) +target_include_directories(kaminpar_io PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/io/../") +target_link_libraries(kaminpar_io PUBLIC KaMinPar::KaMinPar KaMinPar::KaMinParCLI11) + +add_library(KaMinPar::KaMinParIO ALIAS kaminpar_io) + function(add_shm_app target) add_executable(${target} ${ARGN}) target_include_directories(${target} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) - target_link_libraries(${target} PRIVATE KaMinPar::KaMinPar KaMinPar::KaMinParCLI11) + target_link_libraries(${target} PRIVATE KaMinPar::KaMinPar KaMinPar::KaMinParCLI11 KaMinPar::KaMinParIO) install(TARGETS ${target}) message(STATUS "Enabled app: ${target}") endfunction() @@ -16,9 +32,6 @@ function(add_dist_app target) endfunction() add_shm_app(KaMinPar KaMinPar.cc) -target_sources(KaMinPar PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/io/shm_io.cc - ${CMAKE_CURRENT_SOURCE_DIR}/io/shm_input_validator.cc) if (TARGET kaminpar_dist) add_dist_app(dKaMinPar dKaMinPar.cc) @@ -31,3 +44,6 @@ if (KAMINPAR_BUILD_BENCHMARKS) add_subdirectory(benchmarks) endif () +if (KAMINPAR_BUILD_TOOLS) + add_subdirectory(tools) +endif () diff --git a/apps/KaMinPar.cc b/apps/KaMinPar.cc index 2d011c3e..94115f04 100644 --- a/apps/KaMinPar.cc +++ b/apps/KaMinPar.cc @@ -13,16 +13,17 @@ #include #include -#include #if __has_include() #include #endif // __has_include() +#include "kaminpar-shm/datastructures/graph.h" + #include "kaminpar-common/environment.h" +#include "kaminpar-common/heap_profiler.h" #include "kaminpar-common/strutils.h" -#include "apps/io/shm_input_validator.h" #include "apps/io/shm_io.h" using namespace kaminpar; @@ -38,14 +39,21 @@ struct ApplicationContext { int max_timer_depth = 3; + bool heap_profiler_detailed = false; + int heap_profiler_max_depth = 3; + bool heap_profiler_print_structs = true; + float heap_profiler_min_struct_size = 10; + BlockID k = 0; bool quiet = false; bool experiment = false; bool validate = false; + bool debug = false; std::string graph_filename = ""; std::string partition_filename = ""; + io::GraphFileFormat graph_file_format = io::GraphFileFormat::METIS; }; void setup_context(CLI::App &cli, ApplicationContext &app, Context &ctx) { @@ -87,12 +95,59 @@ The output should be stored in a file and can be used by the -C,--config option. ->check(CLI::NonNegativeNumber) ->default_val(app.num_threads); cli.add_flag("-E,--experiment", app.experiment, "Use an output format that is easier to parse."); + cli.add_flag( + "-D,--debug", + app.debug, + "Same as -E, but print additional debug information (that might impose a running time " + "penalty)." + ); cli.add_option( "--max-timer-depth", app.max_timer_depth, "Set maximum timer depth shown in result summary." 
); cli.add_flag_function("-T,--all-timers", [&](auto) { app.max_timer_depth = std::numeric_limits::max(); }); + cli.add_option("-f,--graph-file-format", app.graph_file_format) + ->transform(CLI::CheckedTransformer(io::get_graph_file_formats()).description("")) + ->description(R"(Graph file formats: + - metis + - parhip)") + ->capture_default_str(); + + if constexpr (kHeapProfiling) { + auto *hp_group = cli.add_option_group("Heap Profiler"); + + hp_group + ->add_flag( + "-H,--hp-print-detailed", + app.heap_profiler_detailed, + "Show all levels and data structures in the result summary." + ) + ->default_val(app.heap_profiler_detailed); + hp_group + ->add_option( + "--hp-max-depth", + app.heap_profiler_max_depth, + "Set maximum heap profiler depth shown in the result summary." + ) + ->default_val(app.heap_profiler_max_depth); + hp_group + ->add_option( + "--hp-print-structs", + app.heap_profiler_print_structs, + "Print data structure memory statistics in the result summary." + ) + ->default_val(app.heap_profiler_print_structs); + hp_group + ->add_option( + "--hp-min-struct-size", + app.heap_profiler_min_struct_size, + "Sets the minimum size of a data structure in MB to be included in the result summary." + ) + ->default_val(app.heap_profiler_min_struct_size) + ->check(CLI::NonNegativeNumber); + } + cli.add_option("-o,--output", app.partition_filename, "Output filename for the graph partition.") ->capture_default_str(); cli.add_flag( @@ -132,26 +187,27 @@ int main(int argc, char *argv[]) { std::exit(0); } - // Allocate graph data structures and read graph file - StaticArray xadj; - StaticArray adjncy; - StaticArray vwgt; - StaticArray adjwgt; - - if (app.validate) { - shm::io::metis::read(app.graph_filename, xadj, adjncy, vwgt, adjwgt); - shm::validate_undirected_graph(xadj, adjncy, vwgt, adjwgt); - } else { - shm::io::metis::read(app.graph_filename, xadj, adjncy, vwgt, adjwgt); + if (ctx.compression.enabled && ctx.node_ordering == NodeOrdering::DEGREE_BUCKETS) { + std::cout << "The nodes of the compressed graph cannot be rearranged by degree buckets!" + << std::endl; + std::exit(0); } - const NodeID n = static_cast(xadj.size() - 1); - std::vector partition(n); - - EdgeID *xadj_ptr = xadj.data(); - NodeID *adjncy_ptr = adjncy.data(); - NodeWeight *vwgt_ptr = !vwgt.empty() ? vwgt.data() : nullptr; - EdgeWeight *adjwgt_ptr = !adjwgt.empty() ? 
adjwgt.data() : nullptr; + ENABLE_HEAP_PROFILER(); + + // Read the input graph and allocate memory for the partition + START_HEAP_PROFILER("Input Graph Allocation"); + Graph graph = io::read( + app.graph_filename, + app.graph_file_format, + ctx.compression.enabled, + ctx.compression.may_dismiss, + ctx.node_ordering == NodeOrdering::IMPLICIT_DEGREE_BUCKETS, + app.validate + ); + RECORD("partition") std::vector partition(graph.n()); + RECORD_LOCAL_DATA_STRUCT("vector", partition.capacity() * sizeof(BlockID)); + STOP_HEAP_PROFILER(); // Compute graph partition KaMinPar partitioner(app.num_threads, ctx); @@ -159,14 +215,26 @@ int main(int argc, char *argv[]) { if (app.quiet) { partitioner.set_output_level(OutputLevel::QUIET); + } else if (app.debug) { + partitioner.set_output_level(OutputLevel::DEBUG); } else if (app.experiment) { partitioner.set_output_level(OutputLevel::EXPERIMENT); } partitioner.context().debug.graph_name = str::extract_basename(app.graph_filename); partitioner.set_max_timer_depth(app.max_timer_depth); - partitioner.take_graph(n, xadj_ptr, adjncy_ptr, vwgt_ptr, adjwgt_ptr); + if constexpr (kHeapProfiling) { + auto &global_heap_profiler = heap_profiler::HeapProfiler::global(); + if (app.heap_profiler_detailed) { + global_heap_profiler.set_detailed_summary_options(); + } else { + global_heap_profiler.set_max_depth(app.heap_profiler_max_depth); + global_heap_profiler.set_print_data_structs(app.heap_profiler_print_structs); + global_heap_profiler.set_min_data_struct_size(app.heap_profiler_min_struct_size); + } + } + partitioner.set_graph(std::move(graph)); partitioner.compute_partition(app.k, partition.data()); // Save graph partition @@ -174,5 +242,7 @@ int main(int argc, char *argv[]) { shm::io::partition::write(app.partition_filename, partition); } + DISABLE_HEAP_PROFILER(); + return 0; } diff --git a/apps/benchmarks/CMakeLists.txt b/apps/benchmarks/CMakeLists.txt index 3115d6b7..29cda36a 100644 --- a/apps/benchmarks/CMakeLists.txt +++ b/apps/benchmarks/CMakeLists.txt @@ -3,12 +3,16 @@ function(add_shm_benchmark target) target_link_libraries(${target} PRIVATE KaMinPar::KaMinPar KaMinPar::KaMinParCLI11 + KaMinPar::KaMinParIO KaGen::KaGen) message(STATUS "Enabled benchmark: ${target}") endfunction() # Shared-memory benchmarks +add_shm_benchmark(shm_compressed_graph_benchmark shm_compressed_graph_benchmark.cc) +add_shm_benchmark(shm_label_propagation_benchmark shm_label_propagation_benchmark.cc) add_shm_benchmark(shm_refinement_benchmark shm_refinement_benchmark.cc) +add_shm_benchmark(shm_variable_length_codec_benchmark shm_variable_length_codec_benchmark.cc) add_shm_benchmark(shm_gain_cache_benchmark shm_gain_cache_benchmark.cc) if (KAMINPAR_BUILD_DISTRIBUTED) diff --git a/apps/benchmarks/shm_compressed_graph_benchmark.cc b/apps/benchmarks/shm_compressed_graph_benchmark.cc new file mode 100644 index 00000000..74490f9c --- /dev/null +++ b/apps/benchmarks/shm_compressed_graph_benchmark.cc @@ -0,0 +1,550 @@ +/******************************************************************************* + * Graph compression benchmark for the shared-memory algorithm. 
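+ * Reads the input graph both as a plain CSR graph and as a compressed graph,
+ * optionally checks that both representations answer the basic graph queries
+ * identically, and times the access operations (degree, incident edges,
+ * adjacent nodes, neighbors) on each representation.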
+ * + * @file: shm_compressed_graph_benchmark.cc + * @author: Daniel Salwasser + * @date: 12.11.2023 + ******************************************************************************/ +#include "kaminpar-cli/CLI11.h" + +#include "kaminpar-shm/datastructures/graph.h" +#include "kaminpar-shm/graphutils/permutator.h" + +#include "kaminpar-common/console_io.h" +#include "kaminpar-common/heap_profiler.h" +#include "kaminpar-common/logger.h" +#include "kaminpar-common/timer.h" + +#include "apps/io/shm_io.h" + +using namespace kaminpar; +using namespace kaminpar::shm; + +static std::string to_megabytes(std::size_t bytes) { + std::stringstream stream; + stream << std::fixed << std::setprecision(2) << (bytes / (float)(1024 * 1024)); + return stream.str(); +} + +template static bool operator!=(const IotaRange &a, const IotaRange &b) { + if (a.begin() == a.end()) { + return b.begin() != b.end(); + } + + return a.begin() != b.begin() || a.end() != b.end(); +}; + +// See https://github.com/google/benchmark/blob/main/include/benchmark/benchmark.h +template static inline void do_not_optimize(T value) { + asm volatile("" : "+m"(value) : : "memory"); +} + +template static void benchmark_degree(const Graph &graph) { + SCOPED_HEAP_PROFILER("Degree"); + SCOPED_TIMER("Degree"); + + for (const auto node : graph.nodes()) { + do_not_optimize(graph.degree(node)); + } +} + +template static void benchmark_incident_edges(const Graph &graph) { + SCOPED_HEAP_PROFILER("Incident Edges"); + SCOPED_TIMER("Incident Edges"); + + for (const auto node : graph.nodes()) { + for (const auto incident_edge : graph.incident_edges(node)) { + do_not_optimize(incident_edge); + } + } +} + +template static void benchmark_adjacent_nodes(const Graph &graph) { + SCOPED_HEAP_PROFILER("Adjacent Nodes"); + SCOPED_TIMER("Adjacent Nodes"); + + for (const auto node : graph.nodes()) { + graph.adjacent_nodes(node, [&](const auto adjacent_node) { do_not_optimize(adjacent_node); }); + } +} + +template static void benchmark_neighbors(const Graph &graph) { + SCOPED_HEAP_PROFILER("Neighbors"); + SCOPED_TIMER("Neighbors"); + + for (const auto node : graph.nodes()) { + graph.neighbors(node, [](const auto incident_edge, const auto adjacent_node) { + do_not_optimize(incident_edge); + do_not_optimize(adjacent_node); + }); + } +} + +template static void benchmark_pfor_neighbors(const Graph &graph) { + SCOPED_HEAP_PROFILER("Parallel For Neighbors"); + SCOPED_TIMER("Parallel For Neighbors"); + + for (const auto node : graph.nodes()) { + graph.pfor_neighbors( + node, + std::numeric_limits::max(), + 1000, + [](const auto incident_edge, const auto adjacent_node) { + do_not_optimize(incident_edge); + do_not_optimize(adjacent_node); + } + ); + } +} + +static void expect_equal_size(const CSRGraph &graph, const CompressedGraph &compressed_graph) { + if (graph.n() != compressed_graph.n()) { + LOG_ERROR << "The uncompressed graph has " << graph.n() + << " nodes and the compressed graph has " << compressed_graph.n() << " nodes!"; + return; + } + + if (graph.m() != compressed_graph.m()) { + LOG_ERROR << "The uncompressed graph has " << graph.m() + << " edges and the compressed graph has " << compressed_graph.m() << " edges!"; + return; + } +} + +static void expect_equal_nodes(const CSRGraph &graph, const CompressedGraph &compressed_graph) { + if (graph.nodes() != compressed_graph.nodes()) { + LOG_ERROR << "The nodes of the compressed and uncompressed graph do not match!"; + return; + } +} + +static void expect_equal_edges(const CSRGraph &graph, const CompressedGraph 
&compressed_graph) { + if (graph.edges() != compressed_graph.edges()) { + LOG_ERROR << "The edges of the compressed and uncompressed graph do not match!"; + return; + } +} + +static void expect_equal_degree(const CSRGraph &graph, const CompressedGraph &compressed_graph) { + for (const auto node : graph.nodes()) { + if (graph.degree(node) != compressed_graph.degree(node)) { + LOG_ERROR << "The node " << node << " has degree " << compressed_graph.degree(node) + << " in the compressed graph and degree " << graph.degree(node) + << " in the uncompressed graph!"; + return; + } + } +} + +static void +expect_equal_incident_edges(const CSRGraph &graph, const CompressedGraph &compressed_graph) { + for (const auto node : graph.nodes()) { + if (graph.incident_edges(node) != compressed_graph.incident_edges(node)) { + LOG_ERROR << "The incident edges of node " << node + << " in the compressed and uncompressed graph do not match!"; + return; + } + } +} + +static void +expect_equal_adjacent_nodes(const CSRGraph &graph, const CompressedGraph &compressed_graph) { + std::vector graph_neighbours; + std::vector compressed_graph_neighbours; + + for (const NodeID node : graph.nodes()) { + graph.adjacent_nodes(node, [&](const NodeID adjacent_node) { + graph_neighbours.push_back(adjacent_node); + }); + + compressed_graph.adjacent_nodes(node, [&](const NodeID adjacent_node) { + compressed_graph_neighbours.push_back(adjacent_node); + }); + + if (graph_neighbours.size() != compressed_graph_neighbours.size()) { + LOG_ERROR << "Node " << node << " has " << graph_neighbours.size() + << " neighbours in the uncompressed graph but " + << compressed_graph_neighbours.size() << " neighbours in the compressed graph!"; + return; + } + + std::sort(graph_neighbours.begin(), graph_neighbours.end()); + std::sort(compressed_graph_neighbours.begin(), compressed_graph_neighbours.end()); + if (graph_neighbours != compressed_graph_neighbours) { + LOG_ERROR << "The neighbourhood of node " << node + << " in the compressed and uncompressed graph does not match!"; + return; + } + + graph_neighbours.clear(); + compressed_graph_neighbours.clear(); + } +} + +static void +expect_equal_neighbours(const CSRGraph &graph, const CompressedGraph &compressed_graph) { + std::vector graph_incident_edges; + std::vector graph_adjacent_node; + std::vector compressed_graph_incident_edges; + std::vector compressed_graph_adjacent_node; + + for (const NodeID node : graph.nodes()) { + graph.neighbors(node, [&](const auto incident_edge, const auto adjacent_node) { + graph_incident_edges.push_back(incident_edge); + graph_adjacent_node.push_back(adjacent_node); + }); + + compressed_graph.neighbors(node, [&](const auto incident_edge, const auto adjacent_node) { + compressed_graph_incident_edges.push_back(incident_edge); + compressed_graph_adjacent_node.push_back(adjacent_node); + }); + + if (graph_incident_edges.size() != compressed_graph_incident_edges.size()) { + LOG_ERROR << "Node " << node << " has " << graph_incident_edges.size() + << " neighbours in the uncompressed graph but " + << compressed_graph_incident_edges.size() << " neighbours in the compressed graph!"; + return; + } + + std::sort(graph_incident_edges.begin(), graph_incident_edges.end()); + std::sort(graph_adjacent_node.begin(), graph_adjacent_node.end()); + std::sort(compressed_graph_incident_edges.begin(), compressed_graph_incident_edges.end()); + std::sort(compressed_graph_adjacent_node.begin(), compressed_graph_adjacent_node.end()); + + if (graph_incident_edges != 
compressed_graph_incident_edges) { + LOG_ERROR << "The incident edges of node " << node + << " in the compressed and uncompressed graph do not match!"; + return; + } + + if (graph_adjacent_node != compressed_graph_adjacent_node) { + LOG_ERROR << "The adjacent nodes of node " << node + << " in the compressed and uncompressed graph do not match!"; + return; + } + + graph_incident_edges.clear(); + graph_adjacent_node.clear(); + compressed_graph_incident_edges.clear(); + compressed_graph_adjacent_node.clear(); + } +} + +static void expect_equal_neighbours_max(CSRGraph &graph, const CompressedGraph &compressed_graph) { + std::vector graph_incident_edges; + std::vector graph_adjacent_node; + std::vector compressed_graph_incident_edges; + std::vector compressed_graph_adjacent_node; + + graph::reorder_edges_by_compression(graph); + + for (const NodeID node : graph.nodes()) { + const NodeID max_neighbor_count = graph.degree(node) / 2; + + graph.neighbors( + node, + max_neighbor_count, + [&](const auto incident_edge, const auto adjacent_node) { + graph_incident_edges.push_back(incident_edge); + graph_adjacent_node.push_back(adjacent_node); + } + ); + + compressed_graph.neighbors( + node, + max_neighbor_count, + [&](const auto incident_edge, const auto adjacent_node) { + compressed_graph_incident_edges.push_back(incident_edge); + compressed_graph_adjacent_node.push_back(adjacent_node); + } + ); + + if (graph_incident_edges.size() != compressed_graph_incident_edges.size()) { + LOG_ERROR << "Node " << node << " has " << graph_incident_edges.size() + << " neighbours in the uncompressed graph but " + << compressed_graph_incident_edges.size() << " neighbours in the compressed graph!"; + return; + } + + std::sort(graph_incident_edges.begin(), graph_incident_edges.end()); + std::sort(graph_adjacent_node.begin(), graph_adjacent_node.end()); + std::sort(compressed_graph_incident_edges.begin(), compressed_graph_incident_edges.end()); + std::sort(compressed_graph_adjacent_node.begin(), compressed_graph_adjacent_node.end()); + + if (graph_incident_edges != compressed_graph_incident_edges) { + LOG_ERROR << "The incident edges of node " << node + << " in the compressed and uncompressed graph do not match!"; + return; + } + + if (graph_adjacent_node != compressed_graph_adjacent_node) { + LOG_ERROR << "The adjacent nodes of node " << node + << " in the compressed and uncompressed graph do not match!"; + return; + } + + graph_incident_edges.clear(); + graph_adjacent_node.clear(); + compressed_graph_incident_edges.clear(); + compressed_graph_adjacent_node.clear(); + } +} + +static void +expect_equal_pfor_neighbors(const CSRGraph &graph, const CompressedGraph &compressed_graph) { + tbb::concurrent_vector graph_adjacent_node; + tbb::concurrent_vector compressed_graph_adjacent_node; + + for (const NodeID node : graph.nodes()) { + graph.pfor_neighbors( + node, + std::numeric_limits::max(), + std::numeric_limits::max(), + [&](const EdgeID e, const NodeID v) { graph_adjacent_node.push_back(v); } + ); + + compressed_graph.pfor_neighbors( + node, + std::numeric_limits::max(), + std::numeric_limits::max(), + [&](const EdgeID e, const NodeID v) { compressed_graph_adjacent_node.push_back(v); } + ); + + if (graph_adjacent_node.size() != compressed_graph_adjacent_node.size()) { + LOG_ERROR << "Node " << node << " has " << graph_adjacent_node.size() + << " adjacent nodes in the uncompressed graph but " + << compressed_graph_adjacent_node.size() + << " adjacent node in the compressed graph!"; + return; + } + + 
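+    // pfor_neighbors visits the neighborhood in parallel, so the visit order is
+    // nondeterministic; sort both sequences before comparing them element by element.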
std::sort(graph_adjacent_node.begin(), graph_adjacent_node.end()); + std::sort(compressed_graph_adjacent_node.begin(), compressed_graph_adjacent_node.end()); + + if (graph_adjacent_node != compressed_graph_adjacent_node) { + LOG_ERROR << "The adjacent nodes of node " << node + << " in the compressed and uncompressed graph do not match!"; + return; + } + + graph_adjacent_node.clear(); + compressed_graph_adjacent_node.clear(); + } +} + +static void expect_equal_compressed_graph_edge_weights( + const CSRGraph &graph, const CompressedGraph &compressed_graph +) { + std::vector> csr_graph_edge_weights; + std::vector> compressed_graph_edge_weights; + + for (const NodeID node : graph.nodes()) { + graph.neighbors(node, [&](const EdgeID incident_edge, const NodeID adjacent_node) { + csr_graph_edge_weights.emplace_back(adjacent_node, graph.edge_weight(incident_edge)); + }); + + compressed_graph.neighbors(node, [&](const EdgeID incident_edge, const NodeID adjacent_node) { + compressed_graph_edge_weights.emplace_back( + adjacent_node, compressed_graph.edge_weight(incident_edge) + ); + }); + + if (csr_graph_edge_weights.size() != compressed_graph_edge_weights.size()) { + LOG_ERROR << "Node " << node << " has " << csr_graph_edge_weights.size() + << " adjacent nodes in the uncompressed graph but " + << compressed_graph_edge_weights.size() + << " adjacent node in the compressed graph!"; + return; + } + + std::sort( + csr_graph_edge_weights.begin(), + csr_graph_edge_weights.end(), + [](const auto &a, const auto &b) { return a.first < b.first; } + ); + + std::sort( + compressed_graph_edge_weights.begin(), + compressed_graph_edge_weights.end(), + [](const auto &a, const auto &b) { return a.first < b.first; } + ); + + if (csr_graph_edge_weights != compressed_graph_edge_weights) { + LOG_ERROR << "The edge weights of node " << node + << " in the compressed and uncompressed graph do not match!"; + return; + } + + csr_graph_edge_weights.clear(); + compressed_graph_edge_weights.clear(); + } +} + +static void expect_equal_rearrange_compressed_edge_weights( + CSRGraph &graph, const CompressedGraph &compressed_graph +) { + graph::reorder_edges_by_compression(graph); + + for (const NodeID node : graph.nodes()) { + for (const auto [incident_edge, adjacent_node] : graph.neighbors(node)) { + if (graph.edge_weight(incident_edge) != compressed_graph.edge_weight(incident_edge)) { + LOG_ERROR << "Edge " << incident_edge << " has weight " << graph.edge_weight(incident_edge) + << " in the rearranged uncompressed graph but weight " + << compressed_graph.edge_weight(incident_edge) << " in the compressed graph!"; + return; + } + } + } +} + +static void run_checks(CSRGraph &graph, const CompressedGraph &compressed_graph) { + LOG << "Checking if the graph operations are valid..."; + + expect_equal_size(graph, compressed_graph); + expect_equal_nodes(graph, compressed_graph); + expect_equal_edges(graph, compressed_graph); + expect_equal_degree(graph, compressed_graph); + expect_equal_incident_edges(graph, compressed_graph); + expect_equal_adjacent_nodes(graph, compressed_graph); + expect_equal_neighbours(graph, compressed_graph); + expect_equal_neighbours_max(graph, compressed_graph); + expect_equal_pfor_neighbors(graph, compressed_graph); + expect_equal_compressed_graph_edge_weights(graph, compressed_graph); + expect_equal_rearrange_compressed_edge_weights(graph, compressed_graph); +} + +static void run_benchmark(CSRGraph graph, CompressedGraph compressed_graph) { + LOG << "Running the benchmark..."; + + 
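+  // Run the same set of micro-benchmarks on the plain CSRGraph and CompressedGraph
+  // first, and afterwards on the Graph wrappers that own them, so the timer summary
+  // lists the cost of each representation side by side.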
START_HEAP_PROFILER("Uncompressed graph operations"); + TIMED_SCOPE("Uncompressed graph operations") { + benchmark_degree(graph); + benchmark_incident_edges(graph); + benchmark_adjacent_nodes(graph); + benchmark_neighbors(graph); + benchmark_pfor_neighbors(graph); + }; + STOP_HEAP_PROFILER(); + + START_HEAP_PROFILER("Compressed graph operations"); + TIMED_SCOPE("Compressed graph operations") { + benchmark_degree(compressed_graph); + benchmark_incident_edges(compressed_graph); + benchmark_adjacent_nodes(compressed_graph); + benchmark_neighbors(compressed_graph); + benchmark_pfor_neighbors(compressed_graph); + }; + STOP_HEAP_PROFILER(); + + Graph graph_csr(std::make_unique(std::move(graph))); + START_HEAP_PROFILER("Uncompressed underlying graph operations"); + TIMED_SCOPE("Uncompressed underlying graph operations") { + benchmark_degree(graph_csr); + benchmark_incident_edges(graph_csr); + benchmark_adjacent_nodes(graph_csr); + benchmark_neighbors(graph_csr); + benchmark_pfor_neighbors(graph_csr); + }; + STOP_HEAP_PROFILER(); + + Graph graph_compressed(std::make_unique(std::move(compressed_graph))); + START_HEAP_PROFILER("Compressed underlying graph operations"); + TIMED_SCOPE("Compressed underlying graph operations") { + benchmark_degree(graph_compressed); + benchmark_incident_edges(graph_compressed); + benchmark_adjacent_nodes(graph_compressed); + benchmark_neighbors(graph_compressed); + benchmark_pfor_neighbors(graph_compressed); + }; + STOP_HEAP_PROFILER(); +} + +int main(int argc, char *argv[]) { + // Parse CLI arguments + std::string graph_filename; + int num_threads = 1; + bool enable_benchmarks = true; + bool enable_checks = false; + + CLI::App app("Shared-memory graph compression benchmark"); + app.add_option("-G,--graph", graph_filename, "Graph file")->required(); + app.add_option("-t,--threads", num_threads, "Number of threads") + ->check(CLI::NonNegativeNumber) + ->default_val(num_threads); + app.add_option("-b,--benchmark", enable_benchmarks, "Enable graph operations benchmark") + ->default_val(enable_benchmarks); + app.add_option("-c,--checks", enable_checks, "Enable compressed graph operations check") + ->default_val(enable_checks); + + CLI11_PARSE(app, argc, argv); + + tbb::global_control gc(tbb::global_control::max_allowed_parallelism, num_threads); + + ENABLE_HEAP_PROFILER(); + GLOBAL_TIMER.reset(); + + // Read input graph + LOG << "Reading the input graph..."; + + START_HEAP_PROFILER("CSR Graph Allocation"); + CSRGraph graph = TIMED_SCOPE("Read csr graph") { + return io::metis::csr_read(graph_filename); + }; + STOP_HEAP_PROFILER(); + + START_HEAP_PROFILER("Compressed Graph Allocation"); + CompressedGraph compressed_graph = TIMED_SCOPE("Read compressed graph") { + return *io::metis::compress_read(graph_filename); + }; + STOP_HEAP_PROFILER(); + + // Capture graph statistics + std::size_t csr_size = graph.raw_nodes().size() * sizeof(Graph::EdgeID) + + graph.raw_edges().size() * sizeof(Graph::NodeID); + std::size_t compressed_size = compressed_graph.used_memory(); + std::size_t high_degree_count = compressed_graph.high_degree_count(); + std::size_t part_count = compressed_graph.part_count(); + std::size_t interval_count = compressed_graph.interval_count(); + + // Run checks and benchmarks + if (enable_checks) { + run_checks(graph, compressed_graph); + } + + if (enable_benchmarks) { + run_benchmark(std::move(graph), std::move(compressed_graph)); + } + + STOP_TIMER(); + DISABLE_HEAP_PROFILER(); + + // Print the result summary + LOG; + cio::print_delimiter("Result Summary"); + + 
LOG << "Input graph has " << graph.n() << " vertices and " << graph.m() + << " edges. Its density is " << ((graph.m()) / (float)(graph.n() * (graph.n() - 1))) << "."; + LOG << "Node weights: " << (graph.node_weighted() ? "yes" : "no") + << ", edge weights: " << (graph.edge_weighted() ? "yes" : "no"); + LOG; + + LOG << "The uncompressed graph uses " << to_megabytes(csr_size) << " mb (" << csr_size + << " bytes)."; + LOG << "The compressed graph uses " << to_megabytes(compressed_size) << " mb (" << compressed_size + << " bytes)."; + float compression_factor = csr_size / (float)compressed_size; + LOG << "Thats a compression ratio of " << compression_factor << '.'; + LOG; + + LOG << high_degree_count << " (" << (high_degree_count / (float)graph.n()) + << "%) vertices have high degree."; + LOG << part_count << " parts result from splitting the neighborhood of high degree nodes."; + LOG << interval_count << " vertices/parts use interval encoding."; + LOG; + + Timer::global().print_human_readable(std::cout); + LOG; + PRINT_HEAP_PROFILE(std::cout); + + return 0; +} diff --git a/apps/benchmarks/shm_io.h b/apps/benchmarks/shm_io.h index 723f2841..8c90dead 100644 --- a/apps/benchmarks/shm_io.h +++ b/apps/benchmarks/shm_io.h @@ -55,13 +55,13 @@ inline GraphWrapper load_graph(const std::string &graph_name, const bool is_sort wrapper.adjncy = kagen_graph.TakeAdjncy(); wrapper.vwgt = kagen_graph.TakeVertexWeights(); wrapper.adjvwgt = kagen_graph.TakeEdgeWeights(); - wrapper.graph = std::make_unique( + wrapper.graph = std::make_unique(std::make_unique( StaticArray(wrapper.xadj.data(), wrapper.xadj.size()), StaticArray(wrapper.adjncy.data(), wrapper.adjncy.size()), StaticArray(wrapper.vwgt.data(), wrapper.vwgt.size()), StaticArray(wrapper.adjvwgt.data(), wrapper.adjvwgt.size()), is_sorted - ); + )); std::cout << "Loaded graph with n=" << wrapper.graph->n() << ", m=" << wrapper.graph->m() << std::endl; diff --git a/apps/benchmarks/shm_label_propagation_benchmark.cc b/apps/benchmarks/shm_label_propagation_benchmark.cc new file mode 100644 index 00000000..ac8d2481 --- /dev/null +++ b/apps/benchmarks/shm_label_propagation_benchmark.cc @@ -0,0 +1,115 @@ +/******************************************************************************* + * Generic label propagation benchmark for the shared-memory algorithm. 
+ * + * @file: shm_label_propagation_benchmark.cc + * @author: Daniel Salwasser + * @date: 13.12.2023 + ******************************************************************************/ +// clang-format off +#include +// clang-format on + +#include + +#include "kaminpar-shm/coarsening/lp_clustering.h" +#include "kaminpar-shm/context_io.h" +#include "kaminpar-shm/graphutils/permutator.h" +#include "kaminpar-shm/partition_utils.h" + +#include "kaminpar-common/console_io.h" +#include "kaminpar-common/logger.h" +#include "kaminpar-common/random.h" +#include "kaminpar-common/timer.h" + +#include "apps/io/shm_io.h" + +using namespace kaminpar; +using namespace kaminpar::shm; + +int main(int argc, char *argv[]) { + // Create context + Context ctx = create_default_context(); + + // Parse CLI arguments + std::string graph_filename; + int seed = 0; + + CLI::App app("Shared-memory LP benchmark"); + app.add_option("-G,--graph", graph_filename, "Graph file")->required(); + app.add_option("-t,--threads", ctx.parallel.num_threads, "Number of threads"); + app.add_option("-s,--seed", seed, "Seed for random number generation.")->default_val(seed); + app.add_option("-k,--k", ctx.partition.k, "Number of blocks in the partition.")->required(); + app.add_option( + "-e,--epsilon", + ctx.partition.epsilon, + "Maximum allowed imbalance, e.g. 0.03 for 3%. Must be strictly positive." + ) + ->check(CLI::NonNegativeNumber) + ->capture_default_str(); + create_lp_coarsening_options(&app, ctx); + create_partitioning_rearrangement_options(&app, ctx); + create_graph_compression_options(&app, ctx); + CLI11_PARSE(app, argc, argv); + + tbb::global_control gc(tbb::global_control::max_allowed_parallelism, ctx.parallel.num_threads); + Random::reseed(seed); + + Graph graph = io::read( + graph_filename, + io::GraphFileFormat::METIS, + ctx.compression.enabled, + ctx.compression.may_dismiss, + ctx.node_ordering == NodeOrdering::IMPLICIT_DEGREE_BUCKETS, + false + ); + ctx.setup(graph); + + const double original_epsilon = ctx.partition.epsilon; + if (ctx.node_ordering == NodeOrdering::DEGREE_BUCKETS) { + CSRGraph &csr_graph = *dynamic_cast(graph.underlying_graph()); + graph = graph::rearrange_by_degree_buckets(csr_graph); + } + + if (graph.sorted()) { + graph::remove_isolated_nodes(graph, ctx.partition); + } + + const NodeWeight max_cluster_weight = + compute_max_cluster_weight(ctx.coarsening, graph, ctx.partition); + + LPClustering lp_clustering(graph.n(), ctx.coarsening); + lp_clustering.set_max_cluster_weight(max_cluster_weight); + lp_clustering.set_desired_cluster_count(0); + + GLOBAL_TIMER.reset(); + + ENABLE_HEAP_PROFILER(); + START_HEAP_PROFILER("Label Propagation"); + TIMED_SCOPE("Label Propagation") { + lp_clustering.compute_clustering(graph, false); + }; + STOP_HEAP_PROFILER(); + DISABLE_HEAP_PROFILER(); + + STOP_TIMER(); + + if (graph.sorted()) { + graph::integrate_isolated_nodes(graph, original_epsilon, ctx); + } + + cio::print_delimiter("Input Summary", '#'); + std::cout << "Execution mode: " << ctx.parallel.num_threads << "\n"; + std::cout << "Seed: " << Random::get_seed() << "\n"; + cio::print_delimiter("Graph Compression", '-'); + print(ctx.compression, std::cout); + cio::print_delimiter("Coarsening", '-'); + print(ctx.coarsening, std::cout); + + cio::print_delimiter("Result Summary"); + Timer::global().print_human_readable(std::cout); + LOG; + heap_profiler::HeapProfiler::global().set_detailed_summary_options(); + PRINT_HEAP_PROFILE(std::cout); + + return 0; +} diff --git 
a/apps/benchmarks/shm_variable_length_codec_benchmark.cc b/apps/benchmarks/shm_variable_length_codec_benchmark.cc new file mode 100644 index 00000000..9e9db459 --- /dev/null +++ b/apps/benchmarks/shm_variable_length_codec_benchmark.cc @@ -0,0 +1,403 @@ +/******************************************************************************* + * Variable length codec benchmark for the shared-memory algorithm. + * + * @file: shm_variable_length_codec_benchmark.cc + * @author: Daniel Salwasser + * @date: 12.11.2023 + ******************************************************************************/ +#include +#include +#include +#include + +#include "kaminpar-cli/CLI11.h" + +#include "kaminpar-common/console_io.h" +#include "kaminpar-common/logger.h" +#include "kaminpar-common/timer.h" +#include "kaminpar-common/varint_codec.h" +#include "kaminpar-common/varint_run_length_codec.h" +#include "kaminpar-common/varint_stream_codec.h" + +using namespace kaminpar; + +enum class IntType { + INT_32, + INT_64 +}; + +std::unordered_map get_int_types() { + return { + {"int32", IntType::INT_32}, + {"int64", IntType::INT_64}, + }; +} + +template static inline void do_not_optimize(T value) { + asm volatile("" : "+m"(value) : : "memory"); +} + +template std::vector generate_random_values(const std::size_t count) { + std::vector random_values; + random_values.resize(count); + + std::random_device dev; + std::mt19937 rng(dev()); + std::uniform_int_distribution dist( + std::numeric_limits::min(), std::numeric_limits::max() + ); + for (std::size_t i = 0; i < count; ++i) { + random_values[i] = dist(rng); + } + + return random_values; +} + +template +std::unique_ptr +encode_values(std::string_view name, const std::size_t count, Lambda &&l) { + auto encoded_values = std::make_unique(count * varint_max_length()); + + TIMED_SCOPE(name) { + std::uint8_t *ptr = encoded_values.get(); + + for (std::size_t i = 0; i < count; ++i) { + const std::size_t bytes_written = varint_encode(l(i), ptr); + ptr += bytes_written; + } + }; + + return encoded_values; +} + +template +std::unique_ptr +encode_signed_values(std::string_view name, const std::size_t count, Lambda &&l) { + auto encoded_values = std::make_unique(count * varint_max_length()); + + TIMED_SCOPE(name) { + std::uint8_t *ptr = encoded_values.get(); + + for (std::size_t i = 0; i < count; ++i) { + const std::size_t bytes_written = signed_varint_encode(l(i), ptr); + ptr += bytes_written; + } + }; + + return encoded_values; +} + +template +std::unique_ptr +rl_encode_values(std::string_view name, const std::size_t count, Lambda &&l) { + auto encoded_values = std::make_unique(count * sizeof(Int) + count); + + TIMED_SCOPE(name) { + VarIntRunLengthEncoder encoder(encoded_values.get()); + + for (std::size_t i = 0; i < count; ++i) { + const std::size_t bytes_written = encoder.add(l(i)); + do_not_optimize(bytes_written); + } + + encoder.flush(); + }; + + return encoded_values; +} + +template +std::unique_ptr +sv_encode_values(std::string_view name, const std::size_t count, Lambda &&l) { + auto encoded_values = std::make_unique(count * sizeof(Int) + count); + + TIMED_SCOPE(name) { + VarIntStreamEncoder encoder(encoded_values.get(), count); + + for (std::size_t i = 0; i < count; ++i) { + const std::size_t bytes_written = encoder.add(l(i)); + do_not_optimize(bytes_written); + } + + encoder.flush(); + }; + + return encoded_values; +} + +template +std::tuple< + std::unique_ptr, + std::unique_ptr, + std::unique_ptr> +encode_values(const std::size_t count, const std::vector &random_values) { + 
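+  // Encode three input distributions: all zeros (best case, one byte per value),
+  // all maximal values (worst case), and uniformly random values (average case).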
SCOPED_TIMER("Encoding"); + + return std::make_tuple( + encode_values("Encoding zero values", count, [](const std::size_t i) { return 0; }), + encode_values( + "Encoding max values", + count, + [](const std::size_t i) { return std::numeric_limits::max(); } + ), + encode_values( + "Encoding random values", count, [&](const std::size_t i) { return random_values[i]; } + ) + ); +} + +template +std::tuple< + std::unique_ptr, + std::unique_ptr, + std::unique_ptr> +encode_signed_values(const std::size_t count, const std::vector &random_values) { + SCOPED_TIMER("Encoding signed values"); + + return std::make_tuple( + encode_signed_values( + "Encoding zero values", count, [](const std::size_t i) { return 0; } + ), + encode_signed_values( + "Encoding max values", + count, + [](const std::size_t i) { return std::numeric_limits::max(); } + ), + encode_signed_values( + "Encoding random values", count, [&](const std::size_t i) { return random_values[i]; } + ) + ); +} + +template +std::tuple< + std::unique_ptr, + std::unique_ptr, + std::unique_ptr> +rl_encode_values(const std::size_t count, const std::vector &random_values) { + SCOPED_TIMER("Encoding run-length"); + + return std::make_tuple( + rl_encode_values("Encoding zero values", count, [](const std::size_t i) { return 0; }), + rl_encode_values( + "Encoding max values", + count, + [](const std::size_t i) { return std::numeric_limits::max(); } + ), + rl_encode_values( + "Encoding random values", count, [&](const std::size_t i) { return random_values[i]; } + ) + ); +} + +template +std::tuple< + std::unique_ptr, + std::unique_ptr, + std::unique_ptr> +sv_encode_values(const std::size_t count, const std::vector &random_values) { + SCOPED_TIMER("Encoding stream"); + + return std::make_tuple( + sv_encode_values("Encoding zero values", count, [](const std::size_t i) { return 0; }), + sv_encode_values( + "Encoding max values", + count, + [](const std::size_t i) { return std::numeric_limits::max(); } + ), + sv_encode_values( + "Encoding random values", count, [&](const std::size_t i) { return random_values[i]; } + ) + ); +} + +template +void benchmark( + std::string_view name, const std::size_t count, const std::uint8_t *values_ptr, Lambda &&l +) { + SCOPED_TIMER(name); + + for (std::size_t i = 0; i < count; ++i) { + const auto [value, bytes_decoded] = l(values_ptr); + values_ptr += bytes_decoded; + + do_not_optimize(value); + } +} + +template +void benchmark_rle(std::string_view name, const std::size_t count, const std::uint8_t *values_ptr) { + SCOPED_TIMER(name); + + VarIntRunLengthDecoder decoder(values_ptr); + decoder.decode(count, [](const Int value) { do_not_optimize(value); }); +} + +template +void benchmark_sve(std::string_view name, const std::size_t count, const std::uint8_t *values_ptr) { + SCOPED_TIMER(name); + + VarIntStreamDecoder decoder(values_ptr, count); + decoder.decode(count, [](const Int value) { do_not_optimize(value); }); +} + +template +void benchmark( + std::string_view name, + const std::size_t count, + const std::uint8_t *zero_values_ptr, + const std::uint8_t *max_values_ptr, + const std::uint8_t *random_values_ptr, + Lambda &&l +) { + SCOPED_TIMER(name); + + benchmark("Decoding zero", count, zero_values_ptr, std::forward(l)); + benchmark("Decoding max values", count, max_values_ptr, std::forward(l)); + benchmark("Decoding random values", count, random_values_ptr, std::forward(l)); +} + +template +void benchmark_rle( + std::string_view name, + const std::size_t count, + const std::uint8_t *zero_values_ptr, + const std::uint8_t 
*max_values_ptr, + const std::uint8_t *random_values_ptr +) { + SCOPED_TIMER(name); + + benchmark_rle("Decoding zero values", count, zero_values_ptr); + benchmark_rle("Decoding max values", count, max_values_ptr); + benchmark_rle("Decoding random values", count, random_values_ptr); +} + +template +void benchmark_sve( + std::string_view name, + const std::size_t count, + const std::uint8_t *zero_values_ptr, + const std::uint8_t *max_values_ptr, + const std::uint8_t *random_values_ptr +) { + SCOPED_TIMER(name); + + benchmark_sve("Decoding zero values", count, zero_values_ptr); + benchmark_sve("Decoding max values", count, max_values_ptr); + benchmark_sve("Decoding random values", count, random_values_ptr); +} + +template void run_benchmark(std::size_t count) { + std::vector random_values = generate_random_values(count); + + const auto [encoded_zero_values, encoded_max_values, encoded_random_values] = + encode_values(count, random_values); + + benchmark( + "Decoding: loop", + count, + encoded_zero_values.get(), + encoded_max_values.get(), + encoded_random_values.get(), + [](const std::uint8_t *ptr) { return varint_decode_general(ptr); } + ); + + benchmark( + "Decoding: unrolled + intrinsic", + count, + encoded_zero_values.get(), + encoded_max_values.get(), + encoded_random_values.get(), + [](const std::uint8_t *ptr) { return varint_decode(ptr); } + ); + + std::vector> random_signed_values = + generate_random_values>(count); + + const auto [encoded_zero_signed_values, encoded_max_signed_values, encoded_random_signed_values] = + encode_signed_values>(count, random_signed_values); + + benchmark( + "Decoding signed: loop", + count, + encoded_zero_signed_values.get(), + encoded_max_signed_values.get(), + encoded_random_signed_values.get(), + [](const std::uint8_t *ptr) { + return signed_varint_decode_general>(ptr); + } + ); + + benchmark( + "Decoding signed: unrolled + intrinsic", + count, + encoded_zero_signed_values.get(), + encoded_max_signed_values.get(), + encoded_random_signed_values.get(), + [](const std::uint8_t *ptr) { return signed_varint_decode>(ptr); } + ); + + const auto [rl_encoded_zero_values, rl_encoded_max_values, rl_encoded_random_values] = + rl_encode_values(count, random_values); + + benchmark_rle( + "Decoding run-length", + count, + rl_encoded_zero_values.get(), + rl_encoded_max_values.get(), + rl_encoded_random_values.get() + ); + + if constexpr (sizeof(Int) == 4) { + const auto [sv_encoded_zero_values, sv_encoded_max_values, sv_encoded_random_values] = + sv_encode_values(count, random_values); + + benchmark_sve( + "Decoding stream", + count, + sv_encoded_zero_values.get(), + sv_encoded_max_values.get(), + sv_encoded_random_values.get() + ); + } +} + +int main(int argc, char *argv[]) { + // Parse CLI arguments + IntType int_type = IntType::INT_32; + std::size_t count = 100000000; + + CLI::App app("Shared-memory variable length codec benchmark"); + app.add_option("-n", count, "The amount of numbers to encode and decode") + ->check(CLI::NonNegativeNumber) + ->default_val(count); + app.add_option("-i,--int", int_type) + ->transform(CLI::CheckedTransformer(get_int_types()).description("")) + ->description(R"(Select a int type. 
The options are: + - int32 + - int64 + )"); + CLI11_PARSE(app, argc, argv); + + // Run Benchmark + LOG << "Running the benchmark..."; + GLOBAL_TIMER.reset(); + + switch (int_type) { + case IntType::INT_32: + run_benchmark(count); + break; + case IntType::INT_64: + run_benchmark(count); + break; + }; + + STOP_TIMER(); + + // Print the result summary + LOG; + cio::print_delimiter("Result Summary"); + LOG << "Encoded and decoded " << count << " integers."; + LOG; + Timer::global().print_human_readable(std::cout); +} diff --git a/apps/io/parhip_parser.cc b/apps/io/parhip_parser.cc new file mode 100644 index 00000000..06ad71e9 --- /dev/null +++ b/apps/io/parhip_parser.cc @@ -0,0 +1,239 @@ +/******************************************************************************* + * Sequential ParHiP parser. + * + * @file: parhip_parser.cc + * @author: Daniel Salwasser + * @date: 15.02.2024 + ******************************************************************************/ +#include "apps/io/parhip_parser.h" + +#include +#include +#include + +#include +#include +#include +#include + +#include "kaminpar-common/logger.h" + +namespace kaminpar::shm::io::parhip { + +constexpr std::uint64_t kParhipHeaderSize = 3 * sizeof(std::uint64_t); + +struct ParhipHeader { + bool has_edge_weights; + bool has_node_weights; + bool has_64_bit_edge_id; + bool has_64_bit_node_id; + bool has_64_bit_node_weight; + bool has_64_bit_edge_weight; + std::uint64_t num_nodes; + std::uint64_t num_edges; +}; + +ParhipHeader parse_header(std::array header) { + const std::uint64_t version = header[0]; + return { + (version & 1) == 0, + (version & 2) == 0, + (version & 4) == 0, + (version & 8) == 0, + (version & 16) == 0, + (version & 32) == 0, + header[1], + header[2] + }; +} + +void validate_ids(ParhipHeader header) { + if (header.has_64_bit_edge_id) { + if (sizeof(EdgeID) != 8) { + LOG_ERROR << "The stored graph uses 64-Bit EdgeIDs but this build uses " + << (sizeof(EdgeID) * 8) << "-Bit EdgeIDs."; + std::exit(1); + } + } else if (sizeof(EdgeID) != 4) { + LOG_ERROR << "The stored graph uses 32-Bit EdgeIDs but this build uses " << (sizeof(EdgeID) * 8) + << "-Bit EdgeIDs."; + std::exit(1); + } + + if (header.has_64_bit_node_id) { + if (sizeof(NodeID) != 8) { + LOG_ERROR << "The stored graph uses 64-Bit NodeIDs but this build uses " + << (sizeof(NodeID) * 8) << "-Bit NodeIDs."; + std::exit(1); + } + } else if (sizeof(NodeID) != 4) { + LOG_ERROR << "The stored graph uses 32-Bit EdgeIDs but this build uses " << (sizeof(NodeID) * 8) + << "-Bit NodeIDs."; + std::exit(1); + } + + if (header.has_64_bit_node_weight) { + if (sizeof(NodeWeight) != 8) { + LOG_ERROR << "The stored graph uses 64-Bit node node weights but this build uses " + << (sizeof(NodeWeight) * 8) << "-Bit node weights."; + std::exit(1); + } + } else if (sizeof(NodeWeight) != 4) { + LOG_ERROR << "The stored graph uses 32-Bit node weights but this build uses " + << (sizeof(NodeWeight) * 8) << "-Bit node weights."; + std::exit(1); + } + + if (header.has_64_bit_edge_weight) { + if (sizeof(EdgeWeight) != 8) { + LOG_ERROR << "The stored graph uses 64-Bit node edge weights but this build uses " + << (sizeof(EdgeWeight) * 8) << "-Bit edge weights."; + std::exit(1); + } + } else if (sizeof(NodeWeight) != 4) { + LOG_ERROR << "The stored graph uses 32-Bit edge weights but this build uses " + << (sizeof(EdgeWeight) * 8) << "-Bit edge weights."; + std::exit(1); + } +} + +CSRGraph read_graph( + std::ifstream &in, + const std::uint64_t n, + const std::uint64_t m, + const bool weighted_nodes, + 
const bool weighted_edges,
+    const bool sorted
+) {
+  StaticArray<EdgeID> nodes(n + 1);
+  in.read(reinterpret_cast<char *>(nodes.data()), (n + 1) * sizeof(EdgeID));
+
+  const EdgeID nodes_offset = kParhipHeaderSize + (n + 1) * sizeof(EdgeID);
+  tbb::parallel_for(tbb::blocked_range<NodeID>(0, n + 1), [&](const auto &r) {
+    for (NodeID u = r.begin(); u != r.end(); ++u) {
+      nodes[u] = (nodes[u] - nodes_offset) / sizeof(NodeID);
+    }
+  });
+
+  StaticArray<NodeID> edges(m);
+  in.read(reinterpret_cast<char *>(edges.data()), m * sizeof(NodeID));
+
+  StaticArray<NodeWeight> node_weights;
+  if (weighted_nodes) {
+    node_weights.resize(n);
+    in.read(reinterpret_cast<char *>(node_weights.data()), n * sizeof(NodeWeight));
+  }
+
+  StaticArray<EdgeWeight> edge_weights;
+  if (weighted_edges) {
+    edge_weights.resize(m);
+    in.read(reinterpret_cast<char *>(edge_weights.data()), m * sizeof(EdgeWeight));
+  }
+
+  CSRGraph graph = CSRGraph(
+      std::move(nodes), std::move(edges), std::move(node_weights), std::move(edge_weights), sorted
+  );
+
+  return graph;
+}
+
+CSRGraph csr_read(const std::string &filename, const bool sorted) {
+  std::ifstream in(filename, std::ios::binary);
+  if (!in.is_open()) {
+    LOG_ERROR << "Cannot read graph stored at " << filename << ".";
+    std::exit(1);
+  }
+
+  std::array<std::uint64_t, 3> raw_header;
+  in.read(reinterpret_cast<char *>(raw_header.data()), kParhipHeaderSize);
+
+  ParhipHeader header = parse_header(raw_header);
+  validate_ids(header);
+
+  return read_graph(
+      in,
+      header.num_nodes,
+      header.num_edges,
+      header.has_node_weights,
+      header.has_edge_weights,
+      sorted
+  );
+}
+
+CompressedGraph compressed_read(const std::string &filename, const bool sorted) {
+  const int file = open(filename.c_str(), O_RDONLY);
+  if (file < 0) {
+    LOG_ERROR << "Cannot read graph stored at " << filename << ".";
+    std::exit(1);
+  }
+
+  struct stat file_info {};
+  if (fstat(file, &file_info) < 0) {
+    LOG_ERROR << "Cannot read graph stored at " << filename << ".";
+    close(file);
+    std::exit(1);
+  }
+
+  const std::size_t length = static_cast<std::size_t>(file_info.st_size);
+
+  std::uint8_t *data =
+      static_cast<std::uint8_t *>(mmap(nullptr, length, PROT_READ, MAP_PRIVATE, file, 0));
+  if (data == MAP_FAILED) {
+    LOG_ERROR << "Cannot read graph stored at " << filename << ".";
+    close(file);
+    std::exit(1);
+  }
+
+  std::array<std::uint64_t, 3> raw_header;
+  std::memcpy(raw_header.data(), data, kParhipHeaderSize);
+  data += kParhipHeaderSize;
+
+  const ParhipHeader header = parse_header(raw_header);
+  validate_ids(header);
+
+  CompressedGraphBuilder builder;
+  builder.init(
+      header.num_nodes, header.num_edges, header.has_node_weights, header.has_edge_weights, sorted
+  );
+
+  const EdgeID *nodes = reinterpret_cast<const EdgeID *>(data);
+  data += (header.num_nodes + 1) * sizeof(EdgeID);
+
+  const NodeID *edges = reinterpret_cast<const NodeID *>(data);
+  data += header.num_edges * sizeof(NodeID);
+
+  const NodeWeight *node_weights = reinterpret_cast<const NodeWeight *>(data);
+  data += header.num_nodes * sizeof(NodeWeight);
+
+  const EdgeWeight *edge_weights = reinterpret_cast<const EdgeWeight *>(data);
+
+  const EdgeID nodes_offset = kParhipHeaderSize + (header.num_nodes + 1) * sizeof(EdgeID);
+  std::vector<std::pair<NodeID, EdgeWeight>> neighbourhood;
+  for (NodeID u = 0; u < header.num_nodes; ++u) {
+    const EdgeID offset = (nodes[u] - nodes_offset) / sizeof(NodeID);
+    const EdgeID next_offset = (nodes[u + 1] - nodes_offset) / sizeof(NodeID);
+
+    const NodeID degree = static_cast<NodeID>(next_offset - offset);
+    for (NodeID i = 0; i < degree; ++i) {
+      const EdgeID e = offset + i;
+
+      const NodeID adjacent_node = edges[e];
+      const EdgeWeight edge_weight = header.has_edge_weights ?
edge_weights[e] : 1; + + neighbourhood.push_back(std::make_pair(adjacent_node, edge_weight)); + } + + builder.add_node(u, neighbourhood); + if (header.has_node_weights) { + builder.set_node_weight(u, node_weights[u]); + } + + neighbourhood.clear(); + } + + munmap(data, length); + close(file); + return builder.build(); +} + +} // namespace kaminpar::shm::io::parhip diff --git a/apps/io/parhip_parser.h b/apps/io/parhip_parser.h new file mode 100644 index 00000000..91a46eff --- /dev/null +++ b/apps/io/parhip_parser.h @@ -0,0 +1,21 @@ +/******************************************************************************* + * Sequential ParHiP parser. + * + * @file: parhip_parser.h + * @author: Daniel Salwasser + * @date: 15.02.2024 + ******************************************************************************/ +#pragma once + +#include + +#include "kaminpar-shm/datastructures/compressed_graph.h" +#include "kaminpar-shm/datastructures/csr_graph.h" + +namespace kaminpar::shm::io::parhip { + +CSRGraph csr_read(const std::string &filename, const bool sorted); + +CompressedGraph compressed_read(const std::string &filename, const bool sorted); + +} // namespace kaminpar::shm::io::parhip diff --git a/apps/io/shm_compressed_graph_binary.cc b/apps/io/shm_compressed_graph_binary.cc new file mode 100644 index 00000000..b1c3ad23 --- /dev/null +++ b/apps/io/shm_compressed_graph_binary.cc @@ -0,0 +1,268 @@ +/******************************************************************************* + * IO utilities for the compressed graph binary. + * + * @file: shm_compressed_graph_binary.cc + * @author: Daniel Salwasser + * @date: 12.12.2023 + ******************************************************************************/ +#include "apps/io/shm_compressed_graph_binary.h" + +#include +#include + +#include "kaminpar-common/logger.h" + +namespace kaminpar::shm::io::compressed_binary { + +template static void write_int(std::ofstream &out, const T id) { + out.write(reinterpret_cast(&id), sizeof(T)); +} + +template +static void write_compact_static_array(std::ofstream &out, const CompactStaticArray &array) { + write_int(out, array.byte_width()); + write_int(out, array.allocated_size()); + out.write(reinterpret_cast(array.data()), array.allocated_size()); +} + +template +static void write_static_array(std::ofstream &out, const StaticArray &static_array) { + out.write(reinterpret_cast(static_array.data()), static_array.size() * sizeof(T)); +} + +void write(const std::string &filename, const CompressedGraph &graph) { + std::ofstream out(filename, std::ios::binary); + + write_int(out, kMagicNumber); + + write_int(out, static_cast(sizeof(CompressedGraph::NodeID))); + write_int(out, static_cast(sizeof(CompressedGraph::EdgeID))); + write_int(out, static_cast(sizeof(CompressedGraph::NodeWeight))); + write_int(out, static_cast(sizeof(CompressedGraph::EdgeWeight))); + + write_int(out, static_cast(CompressedGraph::kHighDegreeEncoding)); + write_int(out, CompressedGraph::kHighDegreeThreshold); + write_int(out, CompressedGraph::kHighDegreePartLength); + write_int(out, static_cast(CompressedGraph::kIntervalEncoding)); + write_int(out, CompressedGraph::kIntervalLengthTreshold); + write_int(out, static_cast(CompressedGraph::kRunLengthEncoding)); + write_int(out, static_cast(CompressedGraph::kStreamEncoding)); + write_int(out, static_cast(CompressedGraph::kIsolatedNodesSeparation)); + + write_int(out, graph.n()); + write_int(out, graph.m()); + write_int(out, graph.max_degree()); + write_int(out, static_cast(graph.sorted())); + write_int(out, 
static_cast(graph.node_weighted())); + write_int(out, static_cast(graph.edge_weighted())); + + write_int(out, graph.high_degree_count()); + write_int(out, graph.part_count()); + write_int(out, graph.interval_count()); + + write_compact_static_array(out, graph.raw_nodes()); + + write_int(out, graph.raw_compressed_edges().size()); + write_static_array(out, graph.raw_compressed_edges()); + + if (graph.node_weighted()) { + write_static_array(out, graph.raw_node_weights()); + } + + if (graph.edge_weighted()) { + write_static_array(out, graph.raw_edge_weights()); + } +} + +template static T read_int(std::ifstream &in) { + T t; + in.read(reinterpret_cast(&t), sizeof(T)); + return t; +} + +template static CompactStaticArray read_compact_static_array(std::ifstream &in) { + std::uint8_t byte_width = read_int(in); + std::size_t allocated_size = read_int(in); + + auto data = std::make_unique(allocated_size); + in.read(reinterpret_cast(data.get()), allocated_size); + return CompactStaticArray(byte_width, allocated_size, std::move(data)); +} + +template +static StaticArray read_static_array(std::ifstream &in, const std::size_t size) { + T *ptr = static_cast(std::malloc(sizeof(T) * size)); + in.read(reinterpret_cast(ptr), sizeof(T) * size); + return StaticArray(ptr, size); +} + +CompressedGraph read(const std::string &filename) { + using NodeID = CompressedGraph::NodeID; + using EdgeID = CompressedGraph::EdgeID; + using NodeWeight = CompressedGraph::NodeWeight; + using EdgeWeight = CompressedGraph::EdgeWeight; + + std::ifstream in(filename, std::ios::binary); + + if (kMagicNumber != read_int(in)) { + LOG_ERROR << "The magic number of the file is not correct!"; + std::exit(1); + } + + std::uint8_t stored_node_id_size = read_int(in); + if (stored_node_id_size != sizeof(NodeID)) { + LOG_ERROR << "The stored compressed graph uses " << (stored_node_id_size * 8) + << "-Bit NodeIDs but this build uses " << (sizeof(NodeID) * 8) << "-Bit NodeIDs."; + std::exit(1); + } + + std::uint8_t stored_edge_id_size = read_int(in); + if (stored_edge_id_size != sizeof(EdgeID)) { + LOG_ERROR << "The stored compressed graph uses " << (stored_edge_id_size * 8) + << "-Bit EdgeIDs but this build uses " << (sizeof(EdgeID) * 8) << "-Bit EdgeIDs."; + std::exit(1); + } + + std::uint8_t stored_node_weight_size = read_int(in); + if (stored_node_weight_size != sizeof(NodeWeight)) { + LOG_ERROR << "The stored compressed graph uses " << (stored_node_weight_size * 8) + << "-Bit NodeWeights but this build uses " << (sizeof(NodeWeight) * 8) + << "-Bit NodeWeights."; + std::exit(1); + } + + std::uint8_t stored_edge_weight_size = read_int(in); + if (stored_edge_weight_size != sizeof(EdgeWeight)) { + LOG_ERROR << "The stored compressed graph uses " << (stored_edge_weight_size * 8) + << "-Bit EdgeWeights but this build uses " << (sizeof(EdgeWeight) * 8) + << "-Bit EdgeWeights."; + std::exit(1); + } + + bool high_degree_encoding = static_cast(read_int(in)); + if (high_degree_encoding != CompressedGraph::kHighDegreeEncoding) { + if (high_degree_encoding) { + LOG_ERROR << "The stored compressed graph uses high degree encoding but this build does not."; + } else { + LOG_ERROR + << "The stored compressed graph does not use high degree encoding but this build does."; + } + std::exit(1); + } + + NodeID high_degree_threshold = read_int(in); + if (high_degree_threshold != CompressedGraph::kHighDegreeThreshold) { + LOG_ERROR << "The stored compressed graph uses " << high_degree_threshold + << " as the high degree threshold but this build uses " + << 
(CompressedGraph::kHighDegreeThreshold) << " as the high degree threshold."; + std::exit(1); + } + + NodeID high_degree_part_length = read_int(in); + if (high_degree_part_length != CompressedGraph::kHighDegreePartLength) { + LOG_ERROR << "The stored compressed graph uses " << high_degree_part_length + << " as the high degree part length but this build uses " + << (CompressedGraph::kHighDegreePartLength) << " as the high degree part length."; + std::exit(1); + } + + bool interval_encoding = static_cast(read_int(in)); + if (interval_encoding != CompressedGraph::kIntervalEncoding) { + if (interval_encoding) { + LOG_ERROR << "The stored compressed graph uses interval encoding but this build does not."; + } else { + LOG_ERROR + << "The stored compressed graph does not use interval encoding but this build does."; + } + std::exit(1); + } + + NodeID interval_length_threshold = read_int(in); + if (interval_length_threshold != CompressedGraph::kIntervalLengthTreshold) { + LOG_ERROR << "The stored compressed graph uses " << interval_length_threshold + << " as the interval length threshold but this build uses " + << (CompressedGraph::kIntervalLengthTreshold) << " as the interval length threshold."; + std::exit(1); + } + + bool run_length_encoding = static_cast(read_int(in)); + if (run_length_encoding != CompressedGraph::kRunLengthEncoding) { + if (run_length_encoding) { + LOG_ERROR << "The stored compressed graph uses run-length encoding but this build does not."; + } else { + LOG_ERROR + << "The stored compressed graph does not use run-length encoding but this build does."; + } + std::exit(1); + } + + bool stream_encoding = static_cast(read_int(in)); + if (stream_encoding != CompressedGraph::kStreamEncoding) { + if (stream_encoding) { + LOG_ERROR << "The stored compressed graph uses stream encoding but this build does not."; + } else { + LOG_ERROR << "The stored compressed graph does not use stream encoding but this build does."; + } + std::exit(1); + } + + bool isolated_nodes_separation = static_cast(read_int(in)); + if (isolated_nodes_separation != CompressedGraph::kIsolatedNodesSeparation) { + if (isolated_nodes_separation) { + LOG_ERROR + << "The stored compressed graph uses isolated nodes separation but this build does not."; + } else { + LOG_ERROR << "The stored compressed graph does not use isolated nodes separation but this " + "build does."; + } + std::exit(1); + } + + NodeID n = read_int(in); + EdgeID m = read_int(in); + NodeID max_degree = read_int(in); + bool sorted = static_cast(read_int(in)); + bool is_node_weighted = static_cast(read_int(in)); + bool is_edge_weighted = static_cast(read_int(in)); + + std::size_t high_degree_count = read_int(in); + std::size_t part_count = read_int(in); + std::size_t interval_count = read_int(in); + + CompactStaticArray nodes = read_compact_static_array(in); + + std::size_t compressed_edges_size = read_int(in); + StaticArray compressed_edges = + read_static_array(in, compressed_edges_size); + StaticArray node_weights = + + is_node_weighted ? read_static_array(in, n) : StaticArray(); + + StaticArray edge_weights = + is_edge_weighted ? 
read_static_array(in, m) : StaticArray(); + + return CompressedGraph( + std::move(nodes), + std::move(compressed_edges), + std::move(node_weights), + std::move(edge_weights), + m, + max_degree, + sorted, + high_degree_count, + part_count, + interval_count + ); +} + +bool is_compressed(const std::string &filename) { + const auto size = std::filesystem::file_size(filename); + if (size < sizeof(kMagicNumber)) { + return false; + } + + std::ifstream in(filename, std::ios::binary); + return kMagicNumber == read_int(in); +} + +} // namespace kaminpar::shm::io::compressed_binary diff --git a/apps/io/shm_compressed_graph_binary.h b/apps/io/shm_compressed_graph_binary.h new file mode 100644 index 00000000..0362e3d2 --- /dev/null +++ b/apps/io/shm_compressed_graph_binary.h @@ -0,0 +1,44 @@ +/******************************************************************************* + * IO utilities for the compressed graph binary. + * + * @file: shm_compressed_graph_binary.h + * @author: Daniel Salwasser + * @date: 12.12.2023 + ******************************************************************************/ +#pragma once + +#include + +#include "kaminpar-shm/datastructures/compressed_graph.h" + +namespace kaminpar::shm::io::compressed_binary { + +//! Magic number to identify a compressed graph binary file. +constexpr std::uint64_t kMagicNumber = 0x434F4D5052455353; + +/*! + * Writes a graph to a file as a compressed graph binary. + * + * @param filename The name of the file to write to. + * @param graph The compressed graph to write. + */ +void write(const std::string &filename, const CompressedGraph &graph); + +/*! + * Reads the graph from a compressed graph binary file. If the paramters of the compressed graph + * stored in the file do not match with this build, exit is called. + * + * @param filename The name of the file to read from. + * @return The read compressed graph. + */ +CompressedGraph read(const std::string &filename); + +/*! + * Checks whether a graph is stored in compressed format. + * + * @param filename The name of the file to check. + * @return Whether the graph is stored in compressed format. 
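+ *
+ * A small usage sketch (assuming `graph` is an already built CompressedGraph and
+ * "graph.bin" is a placeholder file name), called from code inside kaminpar::shm:
+ *
+ *   io::compressed_binary::write("graph.bin", graph);
+ *   if (io::compressed_binary::is_compressed("graph.bin")) {
+ *     CompressedGraph restored = io::compressed_binary::read("graph.bin");
+ *   }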
+ */ +bool is_compressed(const std::string &filename); + +} // namespace kaminpar::shm::io::compressed_binary diff --git a/apps/io/shm_io.cc b/apps/io/shm_io.cc index 95b392a1..36b9f251 100644 --- a/apps/io/shm_io.cc +++ b/apps/io/shm_io.cc @@ -9,30 +9,141 @@ #include +#include "kaminpar-shm/datastructures/graph.h" #include "kaminpar-shm/kaminpar.h" #include "kaminpar-common/datastructures/static_array.h" #include "kaminpar-common/logger.h" #include "apps/io/metis_parser.h" +#include "apps/io/parhip_parser.h" +#include "apps/io/shm_compressed_graph_binary.h" +#include "apps/io/shm_input_validator.h" namespace kaminpar::shm::io { // // Public Metis functions // namespace metis { + +template void check_format(kaminpar::io::metis::Format format) { + if constexpr (checked) { + if (format.number_of_nodes >= static_cast(std::numeric_limits::max())) { + LOG_ERROR << "number of nodes is too large for the node ID type"; + std::exit(1); + } + if (format.number_of_edges >= static_cast(std::numeric_limits::max())) { + LOG_ERROR << "number of edges is too large for the edge ID type"; + std::exit(1); + } + if (format.number_of_edges > (format.number_of_nodes * (format.number_of_nodes - 1) / 2)) { + LOG_ERROR << "specified number of edges is impossibly large"; + std::exit(1); + } + } else { + KASSERT( + format.number_of_nodes <= static_cast(std::numeric_limits::max()), + "number of nodes is too large for the node ID type" + ); + KASSERT( + format.number_of_edges <= static_cast(std::numeric_limits::max()), + "number of edges is too large for the edge ID type" + ); + KASSERT( + format.number_of_edges <= (format.number_of_nodes * (format.number_of_nodes - 1)) / 2, + "specified number of edges is impossibly large" + ); + } +} + +template void check_node_weight(const std::uint64_t weight) { + if constexpr (checked) { + if (weight > static_cast(std::numeric_limits::max())) { + LOG_ERROR << "node weight is too large for the node weight type"; + std::exit(1); + } + if (weight <= 0) { + LOG_ERROR << "zero node weights are not supported"; + std::exit(1); + } + } else { + KASSERT( + weight <= static_cast(std::numeric_limits::max()), + "node weight is too large for the node weight type" + ); + KASSERT(weight > 0u, "zero node weights are not supported"); + } +} + template -void read( - const std::string &filename, - StaticArray &nodes, - StaticArray &edges, - StaticArray &node_weights, - StaticArray &edge_weights +void check_edge( + const std::uint64_t node_count, + const std::uint64_t u, + const std::uint64_t weight, + const std::uint64_t v ) { + if constexpr (checked) { + if (weight > static_cast(std::numeric_limits::max())) { + LOG_ERROR << "edge weight is too large for the edge weight type"; + std::exit(1); + } + if (weight <= 0) { + LOG_ERROR << "zero edge weights are not supported"; + std::exit(1); + } + if (v + 1 >= node_count) { + LOG_ERROR << "neighbor " << v + 1 << " of nodes " << u + 1 << " is out of bounds"; + std::exit(1); + } + if (v + 1 == u) { + LOG_ERROR << "detected self-loop on node " << v + 1 << ", which is not allowed"; + std::exit(1); + } + } else { + KASSERT( + weight <= static_cast(std::numeric_limits::max()), + "edge weight is too large for the edge weight type" + ); + KASSERT(weight > 0u, "zero edge weights are not supported"); + KASSERT(v + 1 < node_count, "neighbor out of bounds"); + KASSERT(u != v + 1, "detected illegal self-loop"); + } +} + +template +void check_total_weight(std::int64_t total_node_weight, std::int64_t total_edge_weight) { + if constexpr (checked) { + if 
(total_node_weight > static_cast(std::numeric_limits::max())) { + LOG_ERROR << "total node weight does not fit into the node weight type"; + std::exit(1); + } + if (total_edge_weight > static_cast(std::numeric_limits::max())) { + LOG_ERROR << "total edge weight does not fit into the edge weight type"; + std::exit(1); + } + } else { + KASSERT( + total_node_weight <= static_cast(std::numeric_limits::max()), + "total node weight does not fit into the node weight type" + ); + KASSERT( + total_edge_weight <= static_cast(std::numeric_limits::max()), + "total edge weight does not fit into the edge weight type" + ); + } +} + +template CSRGraph csr_read(const std::string &filename, const bool sorted) { using namespace kaminpar::io::metis; + RECORD("nodes") StaticArray nodes; + RECORD("edges") StaticArray edges; + RECORD("node_weights") StaticArray node_weights; + RECORD("edge_weights") StaticArray edge_weights; + bool store_node_weights = false; bool store_edge_weights = false; + std::int64_t total_node_weight = 0; std::int64_t total_edge_weight = 0; @@ -42,134 +153,50 @@ void read( parse( filename, [&](const auto &format) { - if constexpr (checked) { - if (format.number_of_nodes >= - static_cast(std::numeric_limits::max())) { - LOG_ERROR << "number of nodes is too large for the node ID type"; - std::exit(1); - } - if (format.number_of_edges >= - static_cast(std::numeric_limits::max())) { - LOG_ERROR << "number of edges is too large for the edge ID type"; - std::exit(1); - } - if (format.number_of_edges > - (format.number_of_nodes * (format.number_of_nodes - 1) / 2)) { - LOG_ERROR << "specified number of edges is impossibly large"; - std::exit(1); - } - } else { - KASSERT( - format.number_of_nodes <= - static_cast(std::numeric_limits::max()), - "number of nodes is too large for the node ID type" - ); - KASSERT( - format.number_of_edges <= - static_cast(std::numeric_limits::max()), - "number of edges is too large for the edge ID type" - ); - KASSERT( - format.number_of_edges <= (format.number_of_nodes * (format.number_of_nodes - 1)) / 2, - "specified number of edges is impossibly large" - ); - } + check_format(format); store_node_weights = format.has_node_weights; store_edge_weights = format.has_edge_weights; + nodes.resize(format.number_of_nodes + 1); edges.resize(format.number_of_edges * 2); + if (store_node_weights) { node_weights.resize(format.number_of_nodes); } + if (store_edge_weights) { edge_weights.resize(format.number_of_edges * 2); } }, [&](const std::uint64_t weight) { - if constexpr (checked) { - if (weight > static_cast(std::numeric_limits::max())) { - LOG_ERROR << "node weight is too large for the node weight type"; - std::exit(1); - } - if (weight <= 0) { - LOG_ERROR << "zero node weights are not supported"; - std::exit(1); - } - } else { - KASSERT( - weight <= static_cast(std::numeric_limits::max()), - "node weight is too large for the node weight type" - ); - KASSERT(weight > 0u, "zero node weights are not supported"); - } + check_node_weight(weight); + total_node_weight += weight; if (store_node_weights) { node_weights[u] = static_cast(weight); } + nodes[u] = e; - total_node_weight += weight; - ++u; + u += 1; }, [&](const std::uint64_t weight, const std::uint64_t v) { - if constexpr (checked) { - if (weight > static_cast(std::numeric_limits::max())) { - LOG_ERROR << "edge weight is too large for the edge weight type"; - std::exit(1); - } - if (weight <= 0) { - LOG_ERROR << "zero edge weights are not supported"; - std::exit(1); - } - if (v + 1 >= nodes.size()) { - LOG_ERROR << 
"neighbor " << v + 1 << " of nodes " << u + 1 << " is out of bounds"; - std::exit(1); - } - if (v + 1 == u) { - LOG_ERROR << "detected self-loop on node " << v + 1 << ", which is not allowed"; - std::exit(1); - } - } else { - KASSERT( - weight <= static_cast(std::numeric_limits::max()), - "edge weight is too large for the edge weight type" - ); - KASSERT(weight > 0u, "zero edge weights are not supported"); - KASSERT(v + 1 < nodes.size(), "neighbor out of bounds"); - KASSERT(u != v + 1, "detected illegal self-loop"); - } + check_edge(nodes.size(), u, weight, v); + total_edge_weight += weight; if (store_edge_weights) { edge_weights[e] = static_cast(weight); } + edges[e] = static_cast(v); - total_edge_weight += weight; - ++e; + e += 1; } ); nodes[u] = e; - if constexpr (checked) { - if (total_node_weight > static_cast(std::numeric_limits::max())) { - LOG_ERROR << "total node weight does not fit into the node weight type"; - std::exit(1); - } - if (total_edge_weight > static_cast(std::numeric_limits::max())) { - LOG_ERROR << "total edge weight does not fit into the edge weight type"; - std::exit(1); - } - } else { - KASSERT( - total_node_weight <= static_cast(std::numeric_limits::max()), - "total node weight does not fit into the node weight type" - ); - KASSERT( - total_edge_weight <= static_cast(std::numeric_limits::max()), - "total edge weight does not fit into the edge weight type" - ); - } + check_total_weight(total_node_weight, total_edge_weight); - // only keep weights if the graph is really weighted + // Only keep weights if the graph is really weighted. const bool unit_node_weights = static_cast(total_node_weight + 1) == nodes.size(); if (unit_node_weights) { node_weights.free(); @@ -179,25 +206,208 @@ void read( if (unit_edge_weights) { edge_weights.free(); } + + return CSRGraph( + std::move(nodes), std::move(edges), std::move(node_weights), std::move(edge_weights), sorted + ); } -template void read( - const std::string &filename, - StaticArray &nodes, - StaticArray &edges, - StaticArray &node_weights, - StaticArray &edge_weights -); +template CSRGraph csr_read(const std::string &filename, const bool sorted); +template CSRGraph csr_read(const std::string &filename, const bool sorted); + +template +std::optional +compress_read(const std::string &filename, const bool sorted, const bool may_dismiss) { + using namespace kaminpar::io::metis; + + std::uint64_t number_of_nodes; + bool store_node_weights; + bool store_edge_weights; + + std::size_t uncompressed_edge_array_size; + bool dismissed = false; + + NodeID node = 0; + EdgeID edge = 0; + + CompressedGraphBuilder builder; + RECORD("neighbourhood") std::vector> neighbourhood; + RECORD_LOCAL_DATA_STRUCT("vector>", 0, neighbourhood_stats); + + parse( + filename, + [&](const auto &format) { + check_format(format); + + const std::size_t max_size = CompressedGraphBuilder::compressed_edge_array_max_size( + format.number_of_nodes, format.number_of_edges + ); + const std::size_t node_array_diff = + (sizeof(EdgeID) - math::byte_width(max_size)) * (format.number_of_nodes + 1); + + number_of_nodes = format.number_of_nodes + 1; + uncompressed_edge_array_size = + format.number_of_edges * sizeof(NodeID) * 2 + node_array_diff; + store_node_weights = format.has_node_weights; + store_edge_weights = format.has_edge_weights; + + builder.init( + format.number_of_nodes, + format.number_of_edges, + store_node_weights, + store_edge_weights, + sorted + ); + }, + [&](const std::uint64_t weight) { + check_node_weight(weight); + + if (node > 0) { + 
builder.add_node(node - 1, neighbourhood); + + if (may_dismiss && builder.edge_array_size() > uncompressed_edge_array_size) { + dismissed = true; + return false; + } + + neighbourhood.clear(); + } + + if (store_node_weights) { + builder.set_node_weight(node, static_cast(weight)); + } + + node += 1; + return true; + }, + [&](const std::uint64_t weight, const std::uint64_t v) { + check_edge(number_of_nodes, node, weight, v); + + neighbourhood.push_back(std::pair(static_cast(v), static_cast(weight))); + edge += 1; + } + ); + + if (dismissed) { + return std::nullopt; + } + + builder.add_node(node - 1, neighbourhood); + + check_total_weight(builder.total_node_weight(), builder.total_edge_weight()); + IF_HEAP_PROFILING(neighbourhood_stats->size = neighbourhood.capacity() * sizeof(NodeID)); + + return builder.build(); +} + +template std::optional +compress_read(const std::string &filename, const bool sorted, const bool may_dismiss); +template std::optional +compress_read(const std::string &filename, const bool sorted, const bool may_dismiss); + +void write(const std::string &filename, const Graph &graph) { + std::ofstream out(filename); + + out << graph.n() << ' ' << (graph.m() / 2); + if (graph.node_weighted() || graph.edge_weighted()) { + out << ' '; + + if (graph.node_weighted()) { + out << '1'; + } + + out << (graph.edge_weighted() ? '1' : '0'); + } + out << '\n'; + + for (const NodeID node : graph.nodes()) { + if (graph.node_weighted()) { + out << graph.node_weight(node) << ' '; + } + + graph.neighbors(node, [&](const EdgeID incident_edge, const NodeID adjacent_node) { + out << (adjacent_node + 1) << ' '; + + if (graph.edge_weighted()) { + out << graph.edge_weight(incident_edge) << ' '; + } + }); + + out << '\n'; + } +} -template void read( - const std::string &filename, - StaticArray &nodes, - StaticArray &edges, - StaticArray &node_weights, - StaticArray &edge_weights -); } // namespace metis +std::unordered_map get_graph_file_formats() { + return { + {"metis", GraphFileFormat::METIS}, + {"parhip", GraphFileFormat::PARHIP}, + }; +} + +Graph read( + const std::string &filename, + const GraphFileFormat file_format, + const bool compress, + const bool may_dismiss, + const bool sorted, + const bool validate +) { + if (compressed_binary::is_compressed(filename)) { + if (!compress) { + LOG_ERROR + << "The input graph is stored in a compressed format but graph compression is disabled!"; + std::exit(1); + } + + return Graph(std::make_unique(compressed_binary::read(filename))); + } + + if (compress) { + std::optional compresed_graph = [&] { + if (validate) { + return metis::compress_read(filename, sorted, may_dismiss); + } else { + switch (file_format) { + case GraphFileFormat::METIS: + return metis::compress_read(filename, sorted, may_dismiss); + case GraphFileFormat::PARHIP: + return std::optional(parhip::compressed_read(filename, sorted)); + default: + throw std::runtime_error("unexpected graph file format"); + } + } + }(); + + if (compresed_graph) { + return Graph(std::make_unique(std::move(*compresed_graph))); + } + } + + if (validate) { + CSRGraph csr_graph = metis::csr_read(filename, sorted); + + shm::validate_undirected_graph( + csr_graph.raw_nodes(), + csr_graph.raw_edges(), + csr_graph.raw_node_weights(), + csr_graph.raw_edge_weights() + ); + + return Graph(std::make_unique(std::move(csr_graph))); + } else { + switch (file_format) { + case GraphFileFormat::METIS: + return Graph(std::make_unique(metis::csr_read(filename, sorted))); + case GraphFileFormat::PARHIP: + return 
Graph(std::make_unique(parhip::csr_read(filename, sorted))); + default: + throw std::runtime_error("unexpected graph file format"); + } + } +} + // // Partition // diff --git a/apps/io/shm_io.h b/apps/io/shm_io.h index cc5e4bbb..7136ce87 100644 --- a/apps/io/shm_io.h +++ b/apps/io/shm_io.h @@ -7,25 +7,91 @@ ******************************************************************************/ #pragma once +#include #include #include +#include "kaminpar-shm/datastructures/compressed_graph.h" +#include "kaminpar-shm/datastructures/csr_graph.h" +#include "kaminpar-shm/datastructures/graph.h" #include "kaminpar-shm/kaminpar.h" #include "kaminpar-common/datastructures/static_array.h" namespace kaminpar::shm::io { namespace metis { + +/** + * Reads a graph that is stored in a file in METIS format. + * + * @param filename The name of the file to read. + * @param sorted Whether the nodes of the graph to read are stored in deg-buckets order. + * @tparam checked Whether to validate the read graph. + * @return The graph in compressed sparse row format stored in the file. + */ +template CSRGraph csr_read(const std::string &filename, const bool sorted = false); + +/*! + * Reads and compresses a graph that is stored in a file in METIS format. + * + * @param filename The name of the file to read. + * @param sorted Whether the nodes of the graph to read are stored in deg-buckets order. + * @param may_dismiss Whether the reading process is aborted when the compressed graph uses more + * memory than the uncompressed graph. + * @tparam checked Whether to validate the read graph. + * @return The graph in compressed form stored in the file. + */ template -void read( - const std::string &filename, - StaticArray &nodes, - StaticArray &edges, - StaticArray &node_weights, - StaticArray &edge_weights +std::optional compress_read( + const std::string &filename, const bool sorted = false, const bool may_dismiss = false ); + +/*! + * Writes a graph to a file in METIS format. + * + * @param filename The name of the file for saving the graph. + * @param graph The graph to save. + */ +void write(const std::string &filename, const Graph &graph); + } // namespace metis +/*! + * All graph file formats that can be parsed. + */ +enum class GraphFileFormat { + METIS, + PARHIP +}; + +/*! + * Returns a table which maps identifiers to their corresponding graph file format. + * + * @return A table which maps identifiers to their corresponding graph file format. + */ +std::unordered_map get_graph_file_formats(); + +/*! + * Reads a graph that is either stored in METIS or compressed format. + * + * @param filename The name of the file to read. + * @param file_format The format of the file used to store the graph. + * @param compress Whether to compress the graph. + * @param may_dismiss Whether the compressed graph is only returned when it uses less memory than + * the uncompressed graph. + * @param sorted Whether the nodes of the graph to read are stored in deg-buckets order. + * @param validate Whether to validate the graph. + * @return The graph to read. 
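+ *
+ * A small call sketch (the file name is a placeholder; the boolean arguments are, in
+ * order, compress, may_dismiss, sorted and validate):
+ *
+ *   Graph graph = read("example.graph", GraphFileFormat::METIS, true, true, false, false);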
+ */ +Graph read( + const std::string &filename, + const GraphFileFormat file_format, + const bool compress, + const bool may_dismiss, + const bool sorted, + const bool validate +); + namespace partition { std::vector read(const std::string &filename); void write(const std::string &filename, const std::vector &partition); diff --git a/apps/tools/CMakeLists.txt b/apps/tools/CMakeLists.txt new file mode 100644 index 00000000..08f0c5ab --- /dev/null +++ b/apps/tools/CMakeLists.txt @@ -0,0 +1,10 @@ +function(add_shm_tool target) + add_executable(${target} ${ARGN}) + target_link_libraries(${target} PRIVATE KaMinPar::KaMinPar KaMinPar::KaMinParIO) + message(STATUS "Enabled tool: ${target}") +endfunction() + +# Shared-memory tools +add_shm_tool(shm_graph_compression_tool shm_graph_compression_tool.cc) +add_shm_tool(shm_graph_properties_tool shm_graph_properties_tool.cc) +add_shm_tool(shm_graph_rearrangement_tool shm_graph_rearrangement_tool.cc) diff --git a/apps/tools/shm_graph_compression_tool.cc b/apps/tools/shm_graph_compression_tool.cc new file mode 100644 index 00000000..7db47575 --- /dev/null +++ b/apps/tools/shm_graph_compression_tool.cc @@ -0,0 +1,44 @@ +/******************************************************************************* + * Graph compression tool for the shared-memory algorithm. + * + * @file: shm_graph_compression_tool.cc + * @author: Daniel Salwasser + * @date: 14.12.2023 + ******************************************************************************/ +// clang-format off +#include +// clang-format on + +#include + +#include "kaminpar-common/logger.h" + +#include "apps/io/shm_compressed_graph_binary.h" +#include "apps/io/shm_io.h" + +using namespace kaminpar; +using namespace kaminpar::shm; + +int main(int argc, char *argv[]) { + // Parse CLI arguments + std::string graph_filename; + std::string compressed_graph_filename; + int num_threads = 1; + + CLI::App app("Shared-memory graph compression tool"); + app.add_option("-G,--graph", graph_filename, "Input graph in METIS format")->required(); + app.add_option("--out", compressed_graph_filename, "Ouput file for saving the compressed graph") + ->required(); + app.add_option("-t,--threads", num_threads, "Number of threads"); + CLI11_PARSE(app, argc, argv); + + tbb::global_control gc(tbb::global_control::max_allowed_parallelism, num_threads); + + LOG << "Reading input graph..."; + CompressedGraph graph = *io::metis::compress_read(graph_filename); + + LOG << "Writing compressed graph..."; + io::compressed_binary::write(compressed_graph_filename, graph); + + return 0; +} diff --git a/apps/tools/shm_graph_properties_tool.cc b/apps/tools/shm_graph_properties_tool.cc new file mode 100644 index 00000000..35c2e82a --- /dev/null +++ b/apps/tools/shm_graph_properties_tool.cc @@ -0,0 +1,110 @@ +/******************************************************************************* + * Graph properties tool for the shared-memory algorithm. 
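+ * Reports basic graph statistics (node and edge counts, maximum and average degree,
+ * number of isolated nodes) together with the graph compression configuration.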
+ * + * @file: shm_graph_properties_tool.cc + * @author: Daniel Salwasser + * @date: 26.12.2023 + ******************************************************************************/ +// clang-format off +#include +// clang-format on + +#include + +#include "kaminpar-shm/context_io.h" + +#include "kaminpar-common/console_io.h" +#include "kaminpar-common/logger.h" +#include "kaminpar-common/strutils.h" + +#include "apps/io/shm_io.h" + +using namespace kaminpar; +using namespace kaminpar::shm; + +float average_degree(const Graph &graph) { + std::size_t average_degree = 0; + + for (const NodeID node : graph.nodes()) { + average_degree += graph.degree(node); + } + + return average_degree / (float)graph.n(); +} + +NodeID isolated_nodes(const Graph &graph) { + NodeID count = 0; + + for (const NodeID node : graph.nodes()) { + if (graph.degree(node) == 0) { + count++; + } + } + + return count; +} + +void print_graph_properties(const Graph &graph, const Context ctx, std::ostream &out) { + const float avg_deg = average_degree(graph); + const NodeID isolated_node_count = isolated_nodes(graph); + const std::size_t width = std::ceil(std::log10( + std::max({graph.n(), graph.m(), graph.max_degree(), isolated_node_count}) + )); + + cio::print_delimiter("Graph Properties", '#'); + out << "Graph: " << ctx.debug.graph_name << "\n"; + out << " Number of nodes: " << std::setw(width) << graph.n(); + if (graph.node_weighted()) { + out << " (total weight: " << graph.total_node_weight() << ")\n"; + } else { + out << " (unweighted)\n"; + } + out << " Number of edges: " << std::setw(width) << graph.m(); + if (graph.edge_weighted()) { + out << " (total weight: " << graph.total_edge_weight() << ")\n"; + } else { + out << " (unweighted)\n"; + } + out << " Max degree: " << std::setw(width) << graph.max_degree() << '\n'; + out << " Average degree: " << std::setw(width) << avg_deg << '\n'; + out << " Isolated nodes: " << std::setw(width) << isolated_node_count << '\n'; + + cio::print_delimiter("Graph Compression", '-'); + print(ctx.compression, out); +} + +int main(int argc, char *argv[]) { + Context ctx = create_default_context(); + std::string graph_filename; + io::GraphFileFormat graph_file_format = io::GraphFileFormat::METIS; + + CLI::App app("Shared-memory graph properties tool"); + app.add_option("-G,--graph", graph_filename, "Input graph in METIS format")->required(); + app.add_option("-t,--threads", ctx.parallel.num_threads, "Number of threads"); + app.add_option("-f,--graph-file-format", graph_file_format) + ->transform(CLI::CheckedTransformer(io::get_graph_file_formats()).description("")) + ->description(R"(Graph file formats: + - metis + - parhip)") + ->capture_default_str(); + create_graph_compression_options(&app, ctx); + CLI11_PARSE(app, argc, argv); + + tbb::global_control gc(tbb::global_control::max_allowed_parallelism, ctx.parallel.num_threads); + + Graph graph = io::read( + graph_filename, + graph_file_format, + ctx.compression.enabled, + ctx.compression.may_dismiss, + false, + false + ); + + ctx.debug.graph_name = str::extract_basename(graph_filename); + ctx.compression.setup(graph); + + print_graph_properties(graph, ctx, std::cout); + + return 0; +} diff --git a/apps/tools/shm_graph_rearrangement_tool.cc b/apps/tools/shm_graph_rearrangement_tool.cc new file mode 100644 index 00000000..3a8c77bf --- /dev/null +++ b/apps/tools/shm_graph_rearrangement_tool.cc @@ -0,0 +1,63 @@ +/******************************************************************************* + * CSR graph rearrangement tool for the shared-memory 
algorithm. + * + * @file: shm_graph_rearrangement_tool.cc + * @author: Daniel Salwasser + * @date: 14.12.2023 + ******************************************************************************/ +// clang-format off +#include +// clang-format on + +#include + +#include "kaminpar-shm/graphutils/permutator.h" + +#include "kaminpar-common/logger.h" + +#include "apps/io/shm_io.h" + +using namespace kaminpar; +using namespace kaminpar::shm; + +int main(int argc, char *argv[]) { + Context ctx = create_default_context(); + ctx.partition.k = 0; + + // Parse CLI arguments + std::string graph_filename; + std::string out_graph_filename; + + CLI::App app("Shared-memory graph rearrangement tool"); + app.add_option("-G,--graph", graph_filename, "Input graph in METIS format")->required(); + app.add_option("-O,--out", out_graph_filename, "Ouput file for saving the rearranged graph") + ->required(); + app.add_option("-t,--threads", ctx.parallel.num_threads, "Number of threads"); + create_partitioning_rearrangement_options(&app, ctx); + CLI11_PARSE(app, argc, argv); + + tbb::global_control gc(tbb::global_control::max_allowed_parallelism, ctx.parallel.num_threads); + + LOG << "Reading input graph..."; + CSRGraph input_graph = io::metis::csr_read( + graph_filename, ctx.node_ordering == NodeOrdering::IMPLICIT_DEGREE_BUCKETS + ); + + Graph graph(std::make_unique(std::move(input_graph))); + CSRGraph &csr_graph = *dynamic_cast(graph.underlying_graph()); + + LOG << "Rearranging graph..."; + if (ctx.node_ordering == NodeOrdering::DEGREE_BUCKETS) { + graph = graph::rearrange_by_degree_buckets(csr_graph); + graph::integrate_isolated_nodes(graph, ctx.partition.epsilon, ctx); + } + + if (ctx.edge_ordering == EdgeOrdering::COMPRESSION) { + graph::reorder_edges_by_compression(csr_graph); + } + + LOG << "Writing graph..."; + io::metis::write(out_graph_filename, graph); + + return 0; +} diff --git a/external/KaGen b/external/KaGen index 3882802d..2be1e625 160000 --- a/external/KaGen +++ b/external/KaGen @@ -1 +1 @@ -Subproject commit 3882802dd5a336775360157debbf589efd4a024f +Subproject commit 2be1e6257211d1caf02ec7a07a5027ec7a60a63a diff --git a/external/googletest b/external/googletest index 3b6d48e8..5a37b517 160000 --- a/external/googletest +++ b/external/googletest @@ -1 +1 @@ -Subproject commit 3b6d48e8d5c1d9b3f9f10ac030a94008bfaf032b +Subproject commit 5a37b517ad4ab6738556f0284c256cae1466c5b4 diff --git a/external/kassert b/external/kassert index f0873f85..e683aefa 160000 --- a/external/kassert +++ b/external/kassert @@ -1 +1 @@ -Subproject commit f0873f85ff046c6dee35a85148a51bfab73af44a +Subproject commit e683aefaa8e10ca9683a8c5bf1d63ff986f77cdd diff --git a/flake.lock b/flake.lock new file mode 100644 index 00000000..39b28ea8 --- /dev/null +++ b/flake.lock @@ -0,0 +1,61 @@ +{ + "nodes": { + "flake-utils": { + "inputs": { + "systems": "systems" + }, + "locked": { + "lastModified": 1710146030, + "narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1712963716, + "narHash": "sha256-WKm9CvgCldeIVvRz87iOMi8CFVB1apJlkUT4GGvA0iM=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "cfd6b5fc90b15709b780a5a1619695a88505a176", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + 
}, + "root": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 00000000..1501084a --- /dev/null +++ b/flake.nix @@ -0,0 +1,55 @@ +{ + description = "Shared-memory and distributed graph partitioner for large k partitioning."; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + flake-utils.url = "github:numtide/flake-utils"; + }; + + outputs = { self, nixpkgs, flake-utils, ... }: flake-utils.lib.eachDefaultSystem (system: + let + pkgs = import nixpkgs { inherit system; }; + inputs = builtins.attrValues { + inherit (pkgs) cmake ninja python312 gcc13 tbb_2021_11 sparsehash mpi; + }; + in + { + devShells.default = pkgs.mkShell { + packages = inputs ++ builtins.attrValues { + inherit (pkgs) fish ccache gdb; + }; + + shellHook = '' + exec fish + ''; + }; + + devShells.clang = (pkgs.mkShell.override { stdenv = pkgs.llvmPackages_18.stdenv; }) { + packages = (pkgs.lib.lists.remove pkgs.gcc13 inputs) ++ builtins.attrValues { + inherit (pkgs) fish ccache gdb; + }; + + shellHook = '' + exec fish + ''; + }; + + packages.default = pkgs.stdenv.mkDerivation { + pname = "KaMinPar"; + version = "2.1.0"; + + src = self; + nativeBuildInputs = inputs; + + cmakeFlags = [ "-DKAMINPAR_BUILD_DISTRIBUTED=On" ]; + enableParallelBuilding = true; + + meta = { + description = "Shared-memory and distributed graph partitioner for large k partitioning."; + homepage = "https://github.com/KaHIP/KaMinPar"; + license = pkgs.lib.licenses.mit; + }; + }; + } + ); +} diff --git a/kaminpar-cli/kaminpar_arguments.cc b/kaminpar-cli/kaminpar_arguments.cc index 14d98fce..1f67d8e4 100644 --- a/kaminpar-cli/kaminpar_arguments.cc +++ b/kaminpar-cli/kaminpar_arguments.cc @@ -14,6 +14,7 @@ namespace kaminpar::shm { void create_all_options(CLI::App *app, Context &ctx) { + create_graph_compression_options(app, ctx); create_partitioning_options(app, ctx); create_debug_options(app, ctx); create_coarsening_options(app, ctx); @@ -21,6 +22,23 @@ void create_all_options(CLI::App *app, Context &ctx) { create_refinement_options(app, ctx); } +CLI::Option_group *create_graph_compression_options(CLI::App *app, Context &ctx) { + auto *compression = app->add_option_group("Graph Compression"); + + compression->add_flag("-c,--compress", ctx.compression.enabled, "Enable graph compression") + ->default_val(false); + compression + ->add_flag( + "--may-dismiss", + ctx.compression.may_dismiss, + "Whether the compressed graph is only used if it uses less memory than the uncompressed " + "graph." 
+ ) + ->default_val(false); + + return compression; +} + CLI::Option_group *create_partitioning_options(CLI::App *app, Context &ctx) { auto *partitioning = app->add_option_group("Partitioning"); @@ -52,29 +70,58 @@ CLI::Option_group *create_partitioning_options(CLI::App *app, Context &ctx) { - async-parallel: diversify initial partitioning by replicating coarse graphs each branch of the replication tree asynchronously - sync-parallel: same as async-parallel, but process branches synchronously)") ->capture_default_str(); + partitioning->add_option( + "--p-deep-initial-partitioning-load", + ctx.partitioning.deep_initial_partitioning_load, + "Fraction of cores that should be used for the coarse graph replication phase of deep MGP. A " + "value of '1' will replicate the graph once for every PE, whereas smaller values lead to " + "fewer replications." + ); partitioning ->add_option( - "--p-deep-initial-partitioning-load", - ctx.partitioning.deep_initial_partitioning_load, - "Fraction of cores that should be used for the coarse graph replication phase of deep " - "MGP. A " - "value of '1' will replicate the graph once for every PE, whereas smaller values lead to " - "fewer replications." + "--p-min-consecutive-seq-bipartitioning-levels", + ctx.partitioning.min_consecutive_seq_bipartitioning_levels, + "(set to '0' for the old behaviour)" ) ->capture_default_str(); - partitioning->add_option("--rearrange-by", ctx.rearrange_by) - ->transform(CLI::CheckedTransformer(get_graph_orderings()).description("")) - ->description(R"(Criteria by which the graph is sorted and rearrange: - - natural: keep order of the graph (do not rearrange) - - deg-buckets: sort nodes by degree bucket and rearrange accordingly)") - ->capture_default_str(); + + create_partitioning_rearrangement_options(app, ctx); return partitioning; } +CLI::Option_group *create_partitioning_rearrangement_options(CLI::App *app, Context &ctx) { + auto *rearrangement = app->add_option_group("Partitioning -> Rearrangement"); + + rearrangement->add_option("--node-order", ctx.node_ordering) + ->transform(CLI::CheckedTransformer(get_node_orderings()).description("")) + ->description(R"(Criteria by which the nodes of the graph are sorted and rearranged: + - natural: keep node order of the graph (do not rearrange) + - deg-buckets: sort nodes by degree bucket and rearrange accordingly + - implicit-deg-buckets: nodes of the input graph are sorted by deg-buckets order)") + ->capture_default_str(); + rearrangement->add_option("--edge-order", ctx.edge_ordering) + ->transform(CLI::CheckedTransformer(get_edge_orderings()).description("")) + ->description(R"(Criteria by which the edges of the graph are sorted and rearranged: + - natural: keep edge order of the graph (do not rearrange) + - compression: sort the edges of each neighbourhood with the ordering of the corresponding compressed graph)" + ) + ->capture_default_str(); + + return rearrangement; +} + CLI::Option_group *create_coarsening_options(CLI::App *app, Context &ctx) { auto *coarsening = app->add_option_group("Coarsening"); + // Coarsening options: + coarsening->add_option("--c-algorithm", ctx.coarsening.algorithm) + ->transform(CLI::CheckedTransformer(get_coarsening_algorithms()).description("")) + ->description(R"(One of the following options: + - noop: disable coarsening + - clustering: coarsening by clustering and contracting)") + ->capture_default_str(); + coarsening ->add_option( "--c-contraction-limit", @@ -83,14 +130,24 @@ CLI::Option_group *create_coarsening_options(CLI::App *app, Context 
&ctx) { ) ->capture_default_str(); - coarsening->add_option("--c-clustering-algorithm", ctx.coarsening.algorithm) + coarsening + ->add_option( + "--c-convergence-threshold", + ctx.coarsening.convergence_threshold, + "Coarsening converges once the size of the graph shrinks by " + "less than this factor." + ) + ->capture_default_str(); + + // Clustering options: + coarsening->add_option("--c-clustering-algorithm", ctx.coarsening.clustering.algorithm) ->transform(CLI::CheckedTransformer(get_clustering_algorithms()).description("")) ->description(R"(One of the following options: - noop: disable coarsening - lp: size-constrained label propagation)") ->capture_default_str(); - coarsening->add_option("--c-cluster-weight-limit", ctx.coarsening.cluster_weight_limit) + coarsening->add_option("--c-cluster-weight-limit", ctx.coarsening.clustering.cluster_weight_limit) ->transform(CLI::CheckedTransformer(get_cluster_weight_limits()).description("")) ->description( R"(This option selects the formula used to compute the weight limit for nodes in coarse graphs. @@ -106,21 +163,22 @@ Options are: coarsening ->add_option( "--c-cluster-weight-multiplier", - ctx.coarsening.cluster_weight_multiplier, + ctx.coarsening.clustering.cluster_weight_multiplier, "Multiplicator of the maximum cluster weight base value." ) ->capture_default_str(); coarsening ->add_option( - "--c-coarsening-convergence-threshold", - ctx.coarsening.convergence_threshold, - "Coarsening converges once the size of the graph shrinks by " - "less than this factor." + "--c-max-memory-free-coarsening-level", + ctx.coarsening.clustering.max_mem_free_coarsening_level, + "Maximum coarsening level for which the corresponding memory should be released " + "afterwards" ) ->capture_default_str(); create_lp_coarsening_options(app, ctx); + create_contraction_coarsening_options(app, ctx); return coarsening; } @@ -130,24 +188,70 @@ CLI::Option_group *create_lp_coarsening_options(CLI::App *app, Context &ctx) { lp->add_option( "--c-lp-num-iterations", - ctx.coarsening.lp.num_iterations, + ctx.coarsening.clustering.lp.num_iterations, "Maximum number of label propagation iterations" ) ->capture_default_str(); lp->add_option( "--c-lp-active-large-degree-threshold", - ctx.coarsening.lp.large_degree_threshold, + ctx.coarsening.clustering.lp.large_degree_threshold, "Threshold for ignoring nodes with large degree" ) ->capture_default_str(); lp->add_option( "--c-lp-max-num-neighbors", - ctx.coarsening.lp.max_num_neighbors, + ctx.coarsening.clustering.lp.max_num_neighbors, "Limit the neighborhood to this many nodes" ) ->capture_default_str(); - lp->add_option("--c-lp-two-hop-strategy", ctx.coarsening.lp.two_hop_strategy) + lp->add_option( + "--c-lp-use-two-level-cluster-weight-vector", + ctx.coarsening.clustering.lp.use_two_level_cluster_weight_vector, + "Whether to use the two level cluster weight vector" + ) + ->capture_default_str(); + + lp->add_option( + "--c-lp-two-phases", + ctx.coarsening.clustering.lp.use_two_phases, + "Uses two phases in each iteration, where in the second phase the high-degree nodes are " + "treated separately" + ) + ->capture_default_str(); + lp->add_option( + "--c-lp-second-phase-select-mode", ctx.coarsening.clustering.lp.second_phase_select_mode + ) + ->transform(CLI::CheckedTransformer(get_second_phase_select_modes()).description("")) + ->description( + R"(Determines the mode for selecting nodes for the second phase of label propagation. 
+Options are: + - high-degree: Select nodes with high degree + - full-rating-map: Select nodes which have a full rating map in the first phase + )" + ) + ->capture_default_str(); + lp->add_option( + "--c-lp-second-phase-aggregation-mode", + ctx.coarsening.clustering.lp.second_phase_aggregation_mode + ) + ->transform(CLI::CheckedTransformer(get_second_phase_aggregation_modes()).description("")) + ->description( + R"(Determines the mode for aggregating ratings in the second phase of label propagation. +Options are: + - none: Skip the second phase + - direct: Write the ratings directly into the global vector (shared between threads) + - buffered: Write the ratings into a thread-local buffer and then copy them into the global vector when the buffer is full + )" + ); + lp->add_option( + "--c-lp-second-phase-relabel", + ctx.coarsening.clustering.lp.relabel_before_second_phase, + "Relabel the clusters before running the second phase" + ) + ->capture_default_str(); + + lp->add_option("--c-lp-two-hop-strategy", ctx.coarsening.clustering.lp.two_hop_strategy) ->transform(CLI::CheckedTransformer(get_two_hop_strategies()).description("")) ->description(R"(Determines the strategy for handling singleton clusters during coarsening. Options are: @@ -159,13 +263,15 @@ Options are: ->capture_default_str(); lp->add_option( "--c-lp-two-hop-threshold", - ctx.coarsening.lp.two_hop_threshold, + ctx.coarsening.clustering.lp.two_hop_threshold, "Enable two-hop clustering if plain label propagation shrunk " "the graph by less than this factor" ) ->capture_default_str(); - lp->add_option("--c-lp-isolated-nodes-strategy", ctx.coarsening.lp.isolated_nodes_strategy) + lp->add_option( + "--c-lp-isolated-nodes-strategy", ctx.coarsening.clustering.lp.isolated_nodes_strategy + ) ->transform( CLI::CheckedTransformer(get_isolated_nodes_clustering_strategies()).description("") ) @@ -182,6 +288,32 @@ Options are: return lp; } +CLI::Option_group *create_contraction_coarsening_options(CLI::App *app, Context &ctx) { + auto *contraction = app->add_option_group("Coarsening -> Contraction"); + + contraction->add_option("--c-con-mode", ctx.coarsening.contraction.mode) + ->transform(CLI::CheckedTransformer(get_contraction_modes()).description("")) + ->description(R"(The mode useed for contraction. 
+Options are: + - edge-buffer: Use an edge buffer to store edges temporarily + - no-edge-buffer-naive: Use no edge buffer by computing the neighborhood of each coarse node twice + - no-edge-buffer-remap: Use no edge buffer by remapping the coarse nodes afterwards + )") + ->capture_default_str(); + contraction + ->add_option( + "--c-con-edge-buffer-fill-fraction", + ctx.coarsening.contraction.edge_buffer_fill_fraction, + "The fraction of the total edges with which to fill the edge buffer" + ) + ->capture_default_str(); + contraction->add_flag( + "--c-con-use-compact-mapping", ctx.coarsening.contraction.use_compact_mapping + ); + + return contraction; +} + CLI::Option_group *create_initial_partitioning_options(CLI::App *app, Context &ctx) { auto *ip = app->add_option_group("Initial Partitioning"); @@ -240,6 +372,36 @@ CLI::Option_group *create_lp_refinement_options(CLI::App *app, Context &ctx) { ) ->capture_default_str(); + lp->add_option( + "--r-lp-two-phases", + ctx.refinement.lp.use_two_phases, + "Uses two phases in each iteration, where in the second phase the high-degree nodes are " + "treated separately" + ) + ->capture_default_str(); + lp->add_option("--r-lp-second-phase-select-mode", ctx.refinement.lp.second_phase_select_mode) + ->transform(CLI::CheckedTransformer(get_second_phase_select_modes()).description("")) + ->description( + R"(Determines the mode for selecting nodes for the second phase of label propagation. +Options are: + - high-degree: Select nodes with high degree + - full-rating-map: Select nodes which have a full rating map in the first phase + )" + ) + ->capture_default_str(); + lp->add_option( + "--r-lp-second-phase-aggregation-mode", ctx.refinement.lp.second_phase_aggregation_mode + ) + ->transform(CLI::CheckedTransformer(get_second_phase_aggregation_modes()).description("")) + ->description( + R"(Determines the mode for aggregating ratings in the second phase of label propagation. 
+Options are: + - none: Skip the second phase + - direct: Write the ratings directly into the global vector (shared between threads) + - buffered: Write the ratings into a thread-local buffer and then copy them into the global vector when the buffer is full + )" + ); + return lp; } diff --git a/kaminpar-cli/kaminpar_arguments.h b/kaminpar-cli/kaminpar_arguments.h index 9db5de3f..66a6e088 100644 --- a/kaminpar-cli/kaminpar_arguments.h +++ b/kaminpar-cli/kaminpar_arguments.h @@ -11,17 +11,23 @@ #include "kaminpar-cli/CLI11.h" // clang-format on -#include "kaminpar-shm/context.h" +#include namespace kaminpar::shm { void create_all_options(CLI::App *app, Context &ctx); +CLI::Option_group *create_graph_compression_options(CLI::App *app, Context &ctx); + CLI::Option_group *create_partitioning_options(CLI::App *app, Context &ctx); +CLI::Option_group *create_partitioning_rearrangement_options(CLI::App *app, Context &ctx); + CLI::Option_group *create_coarsening_options(CLI::App *app, Context &ctx); CLI::Option_group *create_lp_coarsening_options(CLI::App *app, Context &ctx); +CLI::Option_group *create_contraction_coarsening_options(CLI::App *app, Context &ctx); + CLI::Option_group *create_initial_partitioning_options(CLI::App *app, Context &ctx); CLI::Option_group *create_refinement_options(CLI::App *app, Context &ctx); diff --git a/kaminpar-common/CMakeLists.txt b/kaminpar-common/CMakeLists.txt index d0bac9bb..8d5b989a 100644 --- a/kaminpar-common/CMakeLists.txt +++ b/kaminpar-common/CMakeLists.txt @@ -24,6 +24,9 @@ find_package(TBB REQUIRED) find_library(NUMA_LIB numa) # optional target_link_libraries(kaminpar_common PUBLIC TBB::tbb TBB::tbbmalloc kassert::kassert) +if (KAMINPAR_BUILD_WITH_GROWT) + target_link_libraries(kaminpar_common PUBLIC growt) +endif () if (NUMA_LIB) target_link_libraries(kaminpar_common PUBLIC ${NUMA_LIB}) endif () diff --git a/kaminpar-common/asserting_cast.h b/kaminpar-common/asserting_cast.h index 244e5b78..8be237e5 100644 --- a/kaminpar-common/asserting_cast.h +++ b/kaminpar-common/asserting_cast.h @@ -58,4 +58,13 @@ template To asserting_cast(const From value) { ); return static_cast(value); } + +template To asserting_cast(const From value) { + KASSERT( + in_range(value), + value << " of type " << typeid(From).name() << " not in range of type " << typeid(To).name(), + assertion_level + ); + return static_cast(value); +} } // namespace kaminpar diff --git a/kaminpar-common/constexpr_utils.h b/kaminpar-common/constexpr_utils.h new file mode 100644 index 00000000..e0c58fc3 --- /dev/null +++ b/kaminpar-common/constexpr_utils.h @@ -0,0 +1,54 @@ +/******************************************************************************* + * Utility functions for constant expressions. + * + * @file: constexpr_utils.h + * @author: Daniel Salwasser + * @date: 29.12.2023 + ******************************************************************************/ +#pragma once + +#include +#include + +namespace kaminpar { + +/*! + * Invokes a function either directly or indirectly depending on a lambda. + * + * @tparam direct Whether to call the function directly. + * @tparam Lambda The type of the lambda to pass to the function. + * @tparam Function The type of the function to invoke. + * @param l The lambda to pass to the function. + * @param fun The function to invoke. 
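+ *
+ * A small sketch with hypothetical lambdas: for direct = false, `l` is expected to pass
+ * an inner lambda to the callback it receives, and that inner lambda is what `fun` ends
+ * up being invoked with:
+ *
+ *   auto l = [](auto &&callback) { callback([](const int v) { return v + 1; }); };
+ *   invoke_maybe_indirect<false>(l, [](auto &&inner) { inner(41); }); // inner yields 42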
+ */ +template +constexpr void invoke_maybe_indirect(Lambda &&l, Function &&fun) { + if constexpr (direct) { + fun(std::forward(l)); + } else { + l([&](auto &&l2) { fun(std::forward(l2)); }); + } +} + +// Utility functions for constexpr loops based on https://stackoverflow.com/a/47563100 +template struct Number { + static const constexpr auto value = N; +}; + +template +constexpr void constexpr_for(Lambda &&l, std::index_sequence) { + (l(Number::value), ...); +} + +/*! + * Calls a lambda a specific amount of times with an index. + * + * @tparam N The amount of times to call a lambda. + * @tparam Lambda The type of lambda to call. + * @param l The lambda to call N times with the current number of times called. + */ +template constexpr void constexpr_for(Lambda &&l) { + constexpr_for(std::forward(l), std::make_index_sequence()); +} + +} // namespace kaminpar diff --git a/kaminpar-common/datastructures/binary_heap.h b/kaminpar-common/datastructures/binary_heap.h index 90c8d5da..481dd239 100644 --- a/kaminpar-common/datastructures/binary_heap.h +++ b/kaminpar-common/datastructures/binary_heap.h @@ -474,6 +474,8 @@ class DynamicBinaryForest { Key key; }; + explicit DynamicBinaryForest() {} + explicit DynamicBinaryForest(const std::size_t capacity, const std::size_t heaps) : _id_pos(capacity, kInvalidID), _heaps(heaps) {} @@ -482,6 +484,11 @@ class DynamicBinaryForest { DynamicBinaryForest &operator=(const DynamicBinaryForest &) = delete; DynamicBinaryForest &operator=(DynamicBinaryForest &&) noexcept = default; + void init(const std::size_t capacity, const std::size_t heaps) { + _id_pos.resize(capacity, kInvalidID); + _heaps.resize(heaps); + } + std::size_t capacity() const { return _id_pos.size(); } @@ -654,6 +661,8 @@ using DynamicBinaryMinForest = DynamicBinaryForest class DynamicBinaryMinMaxForest { public: + DynamicBinaryMinMaxForest() {} + DynamicBinaryMinMaxForest(const std::size_t capacity, const std::size_t heaps) : _max_forest(capacity, heaps), _min_forest(capacity, heaps) {} @@ -664,6 +673,11 @@ template class DynamicBinaryMinMaxForest { DynamicBinaryMinMaxForest(DynamicBinaryMinMaxForest &&) noexcept = default; DynamicBinaryMinMaxForest &operator=(DynamicBinaryMinMaxForest &&) noexcept = default; + void init(const std::size_t capacity, const std::size_t heaps) { + _max_forest.init(capacity, heaps); + _min_forest.init(capacity, heaps); + } + [[nodiscard]] inline std::size_t capacity() const { return _max_forest.capacity(); } diff --git a/kaminpar-common/datastructures/compact_static_array.h b/kaminpar-common/datastructures/compact_static_array.h new file mode 100644 index 00000000..5bcda3f5 --- /dev/null +++ b/kaminpar-common/datastructures/compact_static_array.h @@ -0,0 +1,322 @@ +/******************************************************************************* + * A static array which stores integers with only as many bytes as the largest + * integer requires. + * + * @file: compact_static_array.h + * @author: Daniel Salwasser + * @date: 12.01.2024 + ******************************************************************************/ +#pragma once + +#include +#include +#include + +namespace kaminpar { + +/*! + * A static array which stores integers with only as many bytes as the largest integer requires. + * + * @tparam Int The type of integer to store. 
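For reference, a minimal usage sketch of the constexpr_for helper added in constexpr_utils.h above (not part of the patch). The angle-bracket contents of the template parameter lists were dropped from this hunk, so the sketch assumes the documented constexpr_for<N>(lambda) form, which expands the lambda once per index 0..N-1 via a fold expression:

#include <cstddef>
#include <iostream>

#include "kaminpar-common/constexpr_utils.h"

int main() {
  std::size_t sum = 0;
  // The call is unrolled at compile time; the lambda is invoked with the indices 0, 1, 2, 3.
  kaminpar::constexpr_for<4>([&](const auto i) { sum += i; });
  std::cout << sum << '\n'; // prints 6
}
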
+ */ +template class CompactStaticArray { + static_assert(std::numeric_limits::is_integer); + + class CompactStaticArrayIterator { + public: + using iterator_category = std::random_access_iterator_tag; + using value_type = Int; + using reference = Int &; + using pointer = Int *; + using difference_type = std::ptrdiff_t; + + CompactStaticArrayIterator( + const std::uint8_t byte_width, const Int mask, const std::uint8_t *data + ) + : _byte_width(byte_width), + _mask(mask), + _data(data) {} + + CompactStaticArrayIterator(const CompactStaticArrayIterator &other) = default; + CompactStaticArrayIterator &operator=(const CompactStaticArrayIterator &other) = default; + + Int operator*() const { + return *reinterpret_cast(_data) & _mask; + } + + pointer operator->() const { + return *reinterpret_cast(_data) & _mask; + } + + reference operator[](const difference_type n) const { + return *reinterpret_cast(_data + _byte_width * n) & _mask; + } + + CompactStaticArrayIterator &operator++() { + return _data += _byte_width, *this; + } + + CompactStaticArrayIterator &operator--() { + return _data -= _byte_width, *this; + } + + CompactStaticArrayIterator operator++(int) const { + return CompactStaticArrayIterator{_byte_width, _mask, _data + _byte_width}; + } + + CompactStaticArrayIterator operator--(int) const { + return CompactStaticArrayIterator{_byte_width, _mask, _data - _byte_width}; + } + + CompactStaticArrayIterator operator+(const difference_type n) const { + return CompactStaticArrayIterator{_byte_width, _mask, _data + _byte_width * n}; + } + + CompactStaticArrayIterator operator-(const difference_type n) const { + return CompactStaticArrayIterator{_byte_width, _mask, _data - _byte_width * n}; + } + + CompactStaticArrayIterator &operator+=(const difference_type n) { + return _data += _byte_width * n, *this; + } + + CompactStaticArrayIterator &operator-=(const difference_type n) { + return _data -= _byte_width * n, *this; + } + + difference_type operator+(const CompactStaticArrayIterator &other) const { + return (reinterpret_cast(_data) / _byte_width) + + (reinterpret_cast(other._data) / _byte_width); + } + + difference_type operator-(const CompactStaticArrayIterator &other) const { + return (reinterpret_cast(_data) / _byte_width) - + (reinterpret_cast(other._data) / _byte_width); + } + + bool operator==(const CompactStaticArrayIterator &other) const { + return _data == other._data; + } + + bool operator!=(const CompactStaticArrayIterator &other) const { + return _data != other._data; + } + + bool operator>(const CompactStaticArrayIterator &other) const { + return _data > other._data; + } + + bool operator<(const CompactStaticArrayIterator &other) const { + return _data < other._data; + } + + bool operator>=(const CompactStaticArrayIterator &other) const { + return _data >= other._ptr; + } + + bool operator<=(const CompactStaticArrayIterator &other) const { + return _data <= other._data; + } + + private: + const std::uint8_t _byte_width; + const Int _mask; + const std::uint8_t *_data; + }; + +public: + using value_type = Int; + using size_type = std::size_t; + using reference = value_type &; + using const_reference = const value_type &; + using iterator = CompactStaticArrayIterator; + using const_iterator = const CompactStaticArrayIterator; + + /*! + * Constructs a new CompactStaticArray. + */ + CompactStaticArray() : _byte_width(0), _size(0), _unrestricted_size(0) { + RECORD_DATA_STRUCT(0, _struct); + } + + /*! + * Constructs a new CompactStaticArray. 
+ * + * @param byte_width The number of bytes needed to store the largest integer in the array. + * @param size The number of values to store. + */ + CompactStaticArray(const std::uint8_t byte_width, const std::size_t size) { + KASSERT(byte_width <= 8); + RECORD_DATA_STRUCT(0, _struct); + + resize(byte_width, size); + } + + /*! + * Constructs a new CompactStaticArray. + * + * @param byte_width The number of bytes needed to store the largest integer in the array. + * @param actual_size The number of bytes that the compact representation in memory uses. + * @param data The pointer to the memory location where the data is compactly stored. + */ + CompactStaticArray( + const std::uint8_t byte_width, + const std::size_t actual_size, + std::unique_ptr data + ) + : _byte_width(byte_width), + _size(actual_size), + _values(std::move(data)), + _mask( + (byte_width == 8) ? std::numeric_limits::max() + : (static_cast(1) << (byte_width * 8)) - 1 + ) { + KASSERT(byte_width <= 8); + RECORD_DATA_STRUCT(0, _struct); + } + + CompactStaticArray(const CompactStaticArray &) = delete; + CompactStaticArray &operator=(const CompactStaticArray &) = delete; + + CompactStaticArray(CompactStaticArray &&) noexcept = default; + CompactStaticArray &operator=(CompactStaticArray &&) noexcept = default; + + /*! + * Resizes the array. + * + * @param byte_width The number of bytes needed to store the largest integer in the array. + * @param size The number of values to store. + */ + void resize(const std::uint8_t byte_width, const std::size_t size) { + IF_HEAP_PROFILING( + _struct->size = std::max(_struct->size, byte_width * size + sizeof(Int) - byte_width) + ); + + _byte_width = byte_width; + _size = byte_width * size + sizeof(Int) - byte_width; + _unrestricted_size = _size; + _values = std::make_unique(_size); + _mask = (byte_width == 8) ? std::numeric_limits::max() + : (static_cast(1) << (byte_width * 8)) - 1; + } + + /*! + * Restricts the array to a specific size. This operation can be undone by calling the unrestrict + * method. + * + * @param new_size The number of values to be visible. + */ + void restrict(const std::size_t new_size) { + _unrestricted_size = _size; + _size = _byte_width * new_size + sizeof(Int) - _byte_width; + } + + /*! + * Undos the previous restriction. It does nothing when the restrict method has previously not + * been invoked. + */ + void unrestrict() { + _size = _unrestricted_size; + } + + /*! + * Stores an integer in the array. + * + * @param pos The position in the array at which to store the integer. + * @param value The value to store. + */ + void write(const std::size_t pos, Int value) { + std::uint8_t *data = _values.get() + pos * _byte_width; + + for (std::uint8_t i = 0; i < _byte_width; ++i) { + *data++ = value & 0b11111111; + value >>= 8; + } + } + + /*! + * Accesses an integer in the array. + * + * @param pos The position of the integer in the array to return. + * @return The integer stored at the position in the array. + */ + [[nodiscard]] Int operator[](const std::size_t pos) const { + return *reinterpret_cast(_values.get() + pos * _byte_width) & _mask; + } + + /*! + * Returns an interator to the beginning. + * + * @return An interator to the beginning. + */ + [[nodiscard]] CompactStaticArrayIterator begin() const { + return CompactStaticArrayIterator(_byte_width, _mask, _values.get()); + } + + /*! + * Returns an interator to the end. + * + * @return An interator to the end. 
+ */ + [[nodiscard]] CompactStaticArrayIterator end() const { + return CompactStaticArrayIterator( + _byte_width, _mask, _values.get() + _size - (sizeof(Int) - _byte_width) + ); + } + + /*! + * Returns whether the array is empty. + * + * @return Whether the array is empty. + */ + [[nodiscard]] bool empty() const { + return _size == 0; + } + + /*! + * Returns the amount of integers in the array. + * + * @return The amount of integers in the array. + */ + [[nodiscard]] std::size_t size() const { + return (_size - (sizeof(Int) - _byte_width)) / _byte_width; + } + + /*! + * Returns the number of bytes needed to store the largest integer in the array. + * + * @return The number of bytes needed to store the largest integer in the array. + */ + [[nodiscard]] std::uint8_t byte_width() const { + return _byte_width; + } + + /*! + * Returns the amount of bytes the compact array allocated. + * + * @return The amount of bytes the compact array allocated. + */ + [[nodiscard]] std::size_t allocated_size() const { + return _size; + } + + /*! + * Returns a pointer to the memory location where the data is compactly stored. + * + * @returns A pointer to the memory location where the data is compactly stored. + */ + [[nodiscard]] const std::uint8_t *data() const { + return _values.get(); + } + +private: + std::uint8_t _byte_width; + std::size_t _size; + std::size_t _unrestricted_size; + std::unique_ptr _values; + Int _mask; + + IF_HEAP_PROFILING(heap_profiler::DataStructure *_struct); +}; + +}; // namespace kaminpar diff --git a/kaminpar-common/datastructures/concurrent_bit_vector.h b/kaminpar-common/datastructures/concurrent_bit_vector.h new file mode 100644 index 00000000..86b0a1e4 --- /dev/null +++ b/kaminpar-common/datastructures/concurrent_bit_vector.h @@ -0,0 +1,140 @@ +/******************************************************************************* + * A concurrent bit vector which stores bits compactly and uses atomic read/write operations. + * + * @file: concurrent_bit_vector.h + * @author: Daniel Salwasser + * @date: 25.01.2024 + ******************************************************************************/ +#pragma once + +#include + +#include + +#include "kaminpar-common/math.h" + +namespace kaminpar { + +/*! + * A concurrent bit vector which stores bits compactly and uses atomic read/write operations. + * + * @tparam Size The type of index to use to access bits. + */ +template class ConcurrentBitVector { +public: + /*! + * Constructs a new empty ConcurrentBitVector. + */ + ConcurrentBitVector() : _size(0), _byte_capacity(0) {} + + /*! + * Constructs a new ConcurrentBitVector + * + * @param size The number of bits to store. + */ + ConcurrentBitVector(const Size size) + : _size(size), + _byte_capacity(math::div_ceil(size, 8)), + _data(std::make_unique(_byte_capacity)) {} + + ConcurrentBitVector(const ConcurrentBitVector &) = delete; + ConcurrentBitVector &operator=(const ConcurrentBitVector &) = delete; + + ConcurrentBitVector(ConcurrentBitVector &&) noexcept = default; + ConcurrentBitVector &operator=(ConcurrentBitVector &&) noexcept = default; + + /*! + * Atomically loads a bit. + * + * @param pos The position of the bit to load. + * @return Whether the bit is set. + */ + [[nodiscard]] bool load(const Size pos) const noexcept { + KASSERT(pos < _size); + + std::uint8_t *ptr = _data.get() + (pos / 8); + const std::uint8_t mask = 1 << (pos % 8); + return (__atomic_load_n(ptr, __ATOMIC_RELAXED) & mask) != 0; + } + + /*! + * Atomically sets a bit. + * + * @param pos The position of the bit to set. 
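For reference, a minimal usage sketch of the CompactStaticArray introduced above (not part of the patch). Template arguments were stripped from this hunk, so the sketch assumes the documented CompactStaticArray<Int> form; the stored values and the byte width are made up for illustration:

#include <cstdint>
#include <iostream>

#include "kaminpar-common/datastructures/compact_static_array.h"

int main() {
  using kaminpar::CompactStaticArray;

  // The largest value we need to store is < 2^24, so 3 bytes per entry suffice.
  CompactStaticArray<std::uint64_t> offsets(/* byte_width */ 3, /* size */ 4);
  offsets.write(0, 0);
  offsets.write(1, 42);
  offsets.write(2, 65'536);
  offsets.write(3, 16'777'215); // 2^24 - 1, the largest 3-byte value

  for (std::size_t i = 0; i < offsets.size(); ++i) {
    std::cout << offsets[i] << '\n';
  }

  // 3 bytes per entry plus sizeof(Int) - byte_width bytes of slack so that the
  // full-width read of the last entry stays inside the allocation.
  std::cout << offsets.allocated_size() << " bytes allocated\n";
}
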
+ */ + void set(const Size pos) noexcept { + KASSERT(pos < _size); + + std::uint8_t *ptr = _data.get() + (pos / 8); + const std::uint8_t mask = 1 << (pos % 8); + __atomic_fetch_or(ptr, mask, __ATOMIC_RELAXED); + } + + /*! + * Atomically unsets a bit. + * + * @param pos The position of the bit to unset. + */ + void unset(const Size pos) noexcept { + KASSERT(pos < _size); + + std::uint8_t *ptr = _data.get() + (pos / 8); + const std::uint8_t mask = ~(1 << (pos % 8)); + __atomic_fetch_and(ptr, mask, __ATOMIC_RELAXED); + } + + /*! + * Sets (non-atomically) all bits in the vector. + */ + void set_all() noexcept { + std::fill(_data.get(), _data.get() + _byte_capacity, 0b11111111); + } + + /*! + * Resizes the vector. + * + * @param size The number of bits to store. + */ + void resize(const Size size) { + KASSERT(size > 0); + + _size = size; + _byte_capacity = math::div_ceil(size, 8); + _data = std::make_unique(_byte_capacity); + } + + /*! + * Frees the memory used by this data structure. + */ + void free() { + _size = 0; + _byte_capacity = 0; + _data.release(); + } + + /*! + * Returns the amount of bits that this vector stores. + * + * @return The amount of bits that this vector stores. + */ + [[nodiscard]] Size size() const noexcept { + return _size; + } + + /*! + * Returns the amount of bits that this vector can store, i.e. the size including internal + * fragmentation. + * + * @return The amount of bits that this vector can store. + */ + [[nodiscard]] Size capacity() const noexcept { + return _byte_capacity * 8; + } + +private: + Size _size; + Size _byte_capacity; + std::unique_ptr _data; +}; + +} // namespace kaminpar diff --git a/kaminpar-common/datastructures/concurrent_fast_reset_array.h b/kaminpar-common/datastructures/concurrent_fast_reset_array.h new file mode 100644 index 00000000..da217402 --- /dev/null +++ b/kaminpar-common/datastructures/concurrent_fast_reset_array.h @@ -0,0 +1,120 @@ +/******************************************************************************* + * Static array that can reset used elements in O(# of used elements), similar to FastResetArray. + * But instead of marking an entry as used when it is accessed, it is marked by the user, otherwise + * multiple concurrent accesses to the same value would mark the value as used multiple times. + * + * @file: concurrent_fast_reset_array.h + * @author: Daniel Salwasser + * @date: 29.10.2023 + ******************************************************************************/ +#pragma once + +#include + +#include + +#include "kaminpar-common/heap_profiler.h" +#include "kaminpar-common/parallel/aligned_element.h" + +namespace kaminpar { + +/*! + * A static array that can reset used elements in O(# of used elements). + * + * @tparam Value The type of value to store. + * @tparam Size The type of index to use to access and save values. + */ +template class ConcurrentFastResetArray { +public: + using value_type = Value; + using reference = Value &; + using size_type = Size; + + /*! + * Constructs a new ConcurrentFastResetArray. + * + * @param capacity The capacity of the map, i.e. the amount of values to possibly save. + */ + explicit ConcurrentFastResetArray(const std::size_t capacity = 0) : _data(capacity) { + RECORD_DATA_STRUCT(capacity * sizeof(value_type), _struct); + _used_entries_tls.resize(tbb::this_task_arena::max_concurrency()); + } + + /*! + * Returns the capacity of this array. + * + * @return The capacity of this array. + */ + std::size_t capacity() const { + return _data.capacity(); + } + + /*! 
+ * Returns the thread-local vector of used entries. + * + * @return The thread-local vector of used entries. + */ + [[nodiscard]] std::vector &local_used_entries() { + return _used_entries_tls[tbb::this_task_arena::current_thread_index()].vec; + } + + /*! + * Accesses a value at a position. + * + * @param pos The position of the value in the map to return. It should be greater or equal then + * zero and less then the set capacity. + * @return A reference to the value at the position. + */ + [[nodiscard]] reference operator[](const size_type pos) { + KASSERT(pos < _data.size()); + return _data[pos]; + } + + /*! + * Resized the array. + * + * @param capacity The new capacity of the map, i.e. the amount of values to possibly save. + */ + void resize(const size_type capacity) { + IF_HEAP_PROFILING(_struct->size = std::max(_struct->size, capacity * sizeof(value_type))); + _data.resize(capacity); + _used_entries_tls.resize(tbb::this_task_arena::max_concurrency()); + } + + /*! + * Frees the memory used by this data structure. + */ + void free() { + _data.clear(); + _data.shrink_to_fit(); + + _used_entries_tls.clear(); + _used_entries_tls.shrink_to_fit(); + } + + /*! + * Iterates over all thread-local vector of used entries and clears them afterwards. + * + * @param l The function object that is invoked with a thread-local vector of used entries before + * they are cleared. + */ + template void iterate_and_reset(Lambda &&l) { + tbb::parallel_for(0, _used_entries_tls.size(), [&](const auto i) { + l(i, _used_entries_tls[i]); + + for (const size_type pos : _used_entries_tls[i]) { + _data[pos] = Value(); + } + + _used_entries_tls[i].clear(); + }); + } + +private: + std::vector _data; + std::vector>> _used_entries_tls; + + IF_HEAP_PROFILING(heap_profiler::DataStructure *_struct); +}; + +} // namespace kaminpar diff --git a/kaminpar-common/datastructures/concurrent_two_level_vector.h b/kaminpar-common/datastructures/concurrent_two_level_vector.h new file mode 100644 index 00000000..056f2c9f --- /dev/null +++ b/kaminpar-common/datastructures/concurrent_two_level_vector.h @@ -0,0 +1,484 @@ +/******************************************************************************* + * A two-level vector which stores small values in a contiguous vector and large values in a hash + * table. + * + * @file: concurrent_two_level_vector.h + * @author: Daniel Salwasser + * @date: 18.01.2024 + ******************************************************************************/ +#pragma once + +#include + +#include + +#ifdef KAMINPAR_USES_GROWT +#include +#include +#include +#else +#include +#endif + +#include "kaminpar-common/datastructures/static_array.h" + +namespace kaminpar { + +#ifdef KAMINPAR_USES_GROWT +/*! + * A concurrent two-level vector which consists of a vector and a hash table. The data structure + * stores values of small size directly in the vector and bigger values in the hash table. + * + * @tparam Value The type of integer to store. + * @tparam Size The type of integer to access the values with. + * @tparam FirstValue The type of integer to store in the vector. It has to be smaller than the + * value type. 
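For reference, a minimal usage sketch of the ConcurrentFastResetArray introduced above (not part of the patch). The sketch assumes the documented ConcurrentFastResetArray<Value, Size> parameter order; slot and rating names are illustrative. Each task records the slots it touches in its thread-local used-entries list, and iterate_and_reset later visits and clears only those slots:

#include <atomic>
#include <cstdint>
#include <iostream>

#include <tbb/parallel_for.h>

#include "kaminpar-common/datastructures/concurrent_fast_reset_array.h"

int main() {
  kaminpar::ConcurrentFastResetArray<std::int64_t, std::uint32_t> ratings(1'000);

  // Each task touches a distinct slot: it records the slot in its thread-local
  // list of used entries and writes to it.
  tbb::parallel_for(0u, 1'000u, [&](const std::uint32_t slot) {
    ratings.local_used_entries().push_back(slot);
    ratings[slot] = 2 * static_cast<std::int64_t>(slot);
  });

  // Visit the entries recorded by each thread; afterwards the touched slots are
  // reset to Value(), so the array can be reused without a full clear.
  std::atomic<std::int64_t> total{0};
  ratings.iterate_and_reset([&](const std::size_t /* thread */, const auto &used_slots) {
    std::int64_t local_sum = 0;
    for (const std::uint32_t slot : used_slots) {
      local_sum += ratings[slot];
    }
    total += local_sum;
  });
  std::cout << total << '\n'; // 2 * (0 + 1 + ... + 999) = 999'000
}
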
+ */ +template +class ConcurrentTwoLevelVector { + static_assert(std::numeric_limits::is_integer); + static_assert(std::numeric_limits::is_integer); + static_assert(sizeof(FirstValue) < sizeof(Value)); + + using HasherType = utils_tm::hash_tm::murmur2_hash; + using AllocatorType = ::growt::AlignedAllocator<>; + using ConcurrentHashTable = typename ::growt:: + table_config::table_type; + + // The maximum value of the FirstValue type is used as a special marker in the vector to indicate + // that the value is stored in the hash table. + static constexpr FirstValue kMaxFirstValue = std::numeric_limits::max(); + +public: + /*! + * Constructs a new ConcurrentTwoLevelVector. + * + * @param capacity The capacity of the vector. + */ + ConcurrentTwoLevelVector(const Size capacity = 0) + : _capacity(capacity), + _values(capacity), + _table(0) {} + + ConcurrentTwoLevelVector(const ConcurrentTwoLevelVector &) = delete; + ConcurrentTwoLevelVector &operator=(const ConcurrentTwoLevelVector &) = delete; + + ConcurrentTwoLevelVector(ConcurrentTwoLevelVector &&) noexcept = default; + ConcurrentTwoLevelVector &operator=(ConcurrentTwoLevelVector &&) noexcept = default; + + /*! + * Returns the number of elements that this vector can hold. + * + * @return The number of elements that this vector can hold. + */ + [[nodiscard]] Size capacity() const { + return _capacity; + } + + /*! + * Resizes the vector. + * + * @param capacity The capacity to resize to. + */ + void resize(const Size capacity) { + _values.resize(capacity); + _capacity = capacity; + } + + /*! + * Frees the memory used by this data structure. + */ + void free() { + _values.free(); + _table = ConcurrentHashTable(0); + _capacity = 0; + } + + /*! + * Resets the vector such that new elements can be inserted. + */ + void reset() { + // As growt does not provide a clear function, just create a new hash table. + _table = ConcurrentHashTable(0); + } + + /** + * Reassigns stored values according to a provided mapping. + * + * @param mapping The mapping according to which the values are reassigned. + * @param new_size The new size of the vector. + */ + void reassign(const StaticArray &mapping, const Size new_size) { + StaticArray new_values(new_size); + ConcurrentHashTable new_table(0); + + tbb::parallel_for(tbb::blocked_range(0, _values.size()), [&](const auto &r) { + for (Size pos = r.begin(); pos != r.end(); ++pos) { + const Value value = _values[pos]; + + if (value == kMaxFirstValue) { + Size new_pos = mapping[pos] - 1; + new_values[new_pos] = kMaxFirstValue; + + const Value actual_value = (*_table.get_handle().find(pos)).second; + new_table.get_handle().insert(new_pos, value); + } else if (value != 0) { + Size new_pos = mapping[pos] - 1; + new_values[new_pos] = value; + } + } + }); + + _values = std::move(new_values); + _table = std::move(new_table); + _capacity = new_size; + } + + /*! + * Accesses a value at a given position. + * + * @param pos The position of the value in the vector to return. + * @return The value at the given position. + */ + [[nodiscard]] Value operator[](const Size pos) { + KASSERT(pos < _values.size()); + + const Value value = _values[pos]; + if (value < kMaxFirstValue) { + return value; + } + + auto table_handle = _table.get_handle(); + auto it = table_handle.find(pos); + while (it == table_handle.end()) { + it = table_handle.find(pos); + } + + return (*it).second; + } + + /*! + * Inserts a value at a given position. + * + * @param pos The position in the vector at which the value is to be inserted. 
+ * @param value The value to insert. + */ + void insert(const Size pos, const Value value) { + KASSERT(pos < _values.size()); + + if (value < kMaxFirstValue) { + _values[pos] = value; + } else { + _values[pos] = kMaxFirstValue; + _table.get_handle().insert(pos, value); + } + } + + /** + * Adds atomically a value to a value already stored in the vector. + * + * @param pos The position in the vector at which the value is to be added. + * @param delta The value to add. + */ + void atomic_add(const Size pos, const Value delta) { + KASSERT(pos < _values.size()); + + FirstValue value = _values[pos]; + bool success; + do { + if (value == kMaxFirstValue) { + _table.get_handle().insert_or_update( + pos, delta, [&](auto &lhs, const auto rhs) { return lhs += rhs; }, delta + ); + break; + } + + const Value new_value = static_cast(value) + delta; + if (new_value < kMaxFirstValue) { + success = __atomic_compare_exchange_n( + &_values[pos], &value, new_value, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED + ); + } else { + success = __atomic_compare_exchange_n( + &_values[pos], &value, kMaxFirstValue, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED + ); + + if (success) { + _table.get_handle().insert_or_update( + pos, new_value, [&](auto &lhs, const auto rhs) { return lhs += rhs; }, new_value + ); + break; + } + } + + } while (!success); + } + + /** + * Subtracts atomically a value from a value already stored in the vector. + * + * @param pos The position in the vector at which the value is to be subtracted. + * @param delta The value to subtract. + */ + void atomic_sub(const Size pos, const Value delta) { + KASSERT(pos < _values.size()); + + FirstValue value = _values[pos]; + bool success; + do { + if (value == kMaxFirstValue) { + _table.get_handle().insert_or_update( + pos, -delta, [&](auto &lhs, const auto rhs) { return lhs -= rhs; }, delta + ); + break; + } + + success = __atomic_compare_exchange_n( + &_values[pos], &value, value - delta, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED + ); + } while (!success); + } + +private: + Size _capacity; + StaticArray _values; + ConcurrentHashTable _table; +}; +#else +/*! + * A concurrent two-level vector which consists of a vector and a hash table. The data structure + * stores values of small size directly in the vector and bigger values in the hash table. + * + * @tparam Value The type of integer to store. + * @tparam Size The type of integer to access the values with. + * @tparam FirstValue The type of integer to store in the vector. It has to be smaller than the + * value type. + */ +template +class ConcurrentTwoLevelVector { + static_assert(std::numeric_limits::is_integer); + static_assert(std::numeric_limits::is_integer); + static_assert(sizeof(FirstValue) < sizeof(Value)); + + using ConcurrentHashTable = tbb::concurrent_hash_map; + + // The maximum value of the FirstValue type is used as a special marker in the vector to indicate + // that the value is stored in the hash table. + static constexpr FirstValue kMaxFirstValue = std::numeric_limits::max(); + +public: + /*! + * Constructs a new ConcurrentTwoLevelVector. + * + * @param capacity The capacity of the vector. 
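For reference, a minimal usage sketch of the ConcurrentTwoLevelVector introduced above (not part of the patch; it behaves the same with or without growt). The template parameter list of this hunk is garbled, so the sketch assumes the order <Value, Size, FirstValue> from the doc comment. Values below the FirstValue maximum live in the first-level vector; the maximum itself serves as a marker and larger values spill into the hash table:

#include <cstdint>
#include <iostream>

#include "kaminpar-common/datastructures/concurrent_two_level_vector.h"

int main() {
  // 8-byte logical values, 4-byte indices, 2 bytes actually stored per slot.
  kaminpar::ConcurrentTwoLevelVector<std::uint64_t, std::uint32_t, std::uint16_t> weights(4);

  weights.insert(0, 7);       // fits into 16 bits: stored directly in the vector
  weights.insert(1, 100'000); // too large: slot is marked, value goes to the hash table

  weights.atomic_add(2, 60'000); // still fits into the first level
  weights.atomic_add(2, 10'000); // sum no longer fits: entry migrates to the hash table

  for (std::uint32_t i = 0; i < 3; ++i) {
    std::cout << weights[i] << '\n'; // 7, 100000, 70000
  }
}
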
+ */ + ConcurrentTwoLevelVector(const Size capacity = 0) : _capacity(capacity), _values(capacity) {} + + ConcurrentTwoLevelVector(const ConcurrentTwoLevelVector &) = delete; + ConcurrentTwoLevelVector &operator=(const ConcurrentTwoLevelVector &) = delete; + + ConcurrentTwoLevelVector(ConcurrentTwoLevelVector &&) noexcept = default; + ConcurrentTwoLevelVector &operator=(ConcurrentTwoLevelVector &&) noexcept = default; + + /*! + * Returns the number of elements that this vector can hold. + * + * @return The number of elements that this vector can hold. + */ + [[nodiscard]] Size capacity() const { + return _capacity; + } + + /*! + * Resizes the vector. + * + * @param capacity The capacity to resize to. + */ + void resize(const Size capacity) { + _values.resize(capacity); + _capacity = capacity; + } + + /*! + * Frees the memory used by this data structure. + */ + void free() { + _values.free(); + _table.clear(); + _capacity = 0; + } + + /*! + * Resets the vector such that new elements can be inserted. + */ + void reset() { + _table.clear(); + } + + /** + * Reassigns stored values according to a provided mapping. + * + * @param mapping The mapping according to which the values are reassigned. + * @param new_size The new size of the vector. + */ + void reassign(const StaticArray &mapping, const Size new_size) { + StaticArray new_values(new_size); + ConcurrentHashTable new_table; + + tbb::parallel_for(tbb::blocked_range(0, _values.size()), [&](const auto &r) { + for (Size pos = r.begin(); pos != r.end(); ++pos) { + const Value value = _values[pos]; + + if (value == kMaxFirstValue) { + Size new_pos = mapping[pos] - 1; + new_values[new_pos] = kMaxFirstValue; + + const Value actual_value = [&] { + typename ConcurrentHashTable::const_accessor entry; + _table.find(entry, pos); + return entry->second; + }(); + + typename ConcurrentHashTable::accessor entry; + new_table.insert(entry, new_pos); + entry->second = actual_value; + } else if (value != 0) { + Size new_pos = mapping[pos] - 1; + new_values[new_pos] = value; + } + } + }); + + _values = std::move(new_values); + _table = std::move(new_table); + _capacity = new_size; + } + + /*! + * Accesses a value at a given position. + * + * @param pos The position of the value in the vector to return. + * @return The value at the given position. + */ + [[nodiscard]] Value operator[](const Size pos) { + KASSERT(pos < _values.size()); + + const Value value = _values[pos]; + if (value < kMaxFirstValue) { + return value; + } + + typename ConcurrentHashTable::const_accessor entry; + bool found; + do { + found = _table.find(entry, pos); + } while (!found); + + return entry->second; + } + + /*! + * Inserts a value at a given position. + * + * @param pos The position in the vector at which the value is to be inserted. + * @param value The value to insert. + */ + void insert(const Size pos, const Value value) { + KASSERT(pos < _values.size()); + + if (value < kMaxFirstValue) { + _values[pos] = value; + } else { + _values[pos] = kMaxFirstValue; + + typename ConcurrentHashTable::accessor entry; + _table.insert(entry, pos); + entry->second = value; + } + } + + /** + * Adds atomically a value to a value already stored in the vector. + * + * @param pos The position in the vector at which the value is to be added. + * @param delta The value to add. 
+ */ + void atomic_add(const Size pos, const Value delta) { + KASSERT(pos < _values.size()); + + FirstValue value = _values[pos]; + bool success; + do { + if (value == kMaxFirstValue) { + typename ConcurrentHashTable::accessor entry; + if (_table.insert(entry, pos)) { + entry->second = delta; + } else { + entry->second += delta; + } + + break; + } + + const Value new_value = static_cast(value) + delta; + if (new_value < kMaxFirstValue) { + success = __atomic_compare_exchange_n( + &_values[pos], &value, new_value, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED + ); + } else { + success = __atomic_compare_exchange_n( + &_values[pos], &value, kMaxFirstValue, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED + ); + + if (success) { + typename ConcurrentHashTable::accessor entry; + if (_table.insert(entry, pos)) { + entry->second = new_value; + } else { + entry->second += new_value; + } + + break; + } + } + + } while (!success); + } + + /** + * Subtracts atomically a value from a value already stored in the vector. + * + * @param pos The position in the vector at which the value is to be subtracted. + * @param delta The value to subtract. + */ + void atomic_sub(const Size pos, const Value delta) { + KASSERT(pos < _values.size()); + + FirstValue value = _values[pos]; + bool success; + do { + if (value == kMaxFirstValue) { + typename ConcurrentHashTable::accessor entry; + if (_table.insert(entry, pos)) { + entry->second = -delta; + } else { + entry->second -= delta; + } + + break; + } + + success = __atomic_compare_exchange_n( + &_values[pos], &value, value - delta, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED + ); + } while (!success); + } + +private: + Size _capacity; + StaticArray _values; + ConcurrentHashTable _table; +}; +#endif + +} // namespace kaminpar diff --git a/kaminpar-common/datastructures/fast_reset_array.h b/kaminpar-common/datastructures/fast_reset_array.h index 1779a0dc..87142e8e 100644 --- a/kaminpar-common/datastructures/fast_reset_array.h +++ b/kaminpar-common/datastructures/fast_reset_array.h @@ -11,6 +11,8 @@ #include #include "kaminpar-common/assert.h" +#include "kaminpar-common/datastructures/scalable_vector.h" +#include "kaminpar-common/heap_profiler.h" #include "kaminpar-common/ranges.h" namespace kaminpar { @@ -21,7 +23,9 @@ template class FastResetArray { using const_reference = const Value &; using size_type = Size; - explicit FastResetArray(const std::size_t capacity = 0) : _data(capacity) {} + explicit FastResetArray(const std::size_t capacity = 0) : _data(capacity) { + RECORD_DATA_STRUCT(capacity * sizeof(value_type), _struct); + } FastResetArray(const FastResetArray &) = delete; FastResetArray &operator=(const FastResetArray &) = delete; @@ -30,9 +34,18 @@ template class FastResetArray { reference operator[](const size_type pos) { KASSERT(pos < _data.size()); + if (_data[pos] == Value()) { _used_entries.push_back(pos); + + IF_HEAP_PROFILING( + _struct->size = std::max( + _struct->size, + _data.capacity() * sizeof(value_type) + _used_entries.capacity() * sizeof(size_type) + ) + ); } + return _data[pos]; } const_reference operator[](const size_type pos) const { @@ -90,6 +103,13 @@ template class FastResetArray { } void resize(const std::size_t capacity) { _data.resize(capacity); + + IF_HEAP_PROFILING( + _struct->size = std::max( + _struct->size, + _data.capacity() * sizeof(value_type) + _used_entries.capacity() * sizeof(size_type) + ) + ); } [[nodiscard]] std::size_t memory_in_kb() const { @@ -97,7 +117,9 @@ template class FastResetArray { } private: - std::vector _data; + 
scalable_vector _data; std::vector _used_entries{}; + + IF_HEAP_PROFILING(heap_profiler::DataStructure *_struct); }; } // namespace kaminpar diff --git a/kaminpar-common/datastructures/fixed_size_sparse_map.h b/kaminpar-common/datastructures/fixed_size_sparse_map.h index 6ed5c996..66b62213 100644 --- a/kaminpar-common/datastructures/fixed_size_sparse_map.h +++ b/kaminpar-common/datastructures/fixed_size_sparse_map.h @@ -30,6 +30,7 @@ #include #include "kaminpar-common/assert.h" +#include "kaminpar-common/heap_profiler.h" #include "kaminpar-common/math.h" namespace kaminpar { @@ -72,6 +73,7 @@ class FixedSizeSparseMap { _timestamp(1), _sparse(nullptr), _dense(nullptr) { + RECORD_DATA_STRUCT(0, _struct); allocate(MAP_SIZE); } @@ -83,6 +85,7 @@ class FixedSizeSparseMap { _timestamp(1), _sparse(nullptr), _dense(nullptr) { + RECORD_DATA_STRUCT(0, _struct); allocate(max_size); } @@ -188,6 +191,12 @@ class FixedSizeSparseMap { _sparse = reinterpret_cast(_data.get()); _dense = reinterpret_cast(_data.get() + +sizeof(SparseElement) * _map_size); std::memset(_data.get(), 0, _map_size * (sizeof(Element) + sizeof(SparseElement))); + + IF_HEAP_PROFILING( + _struct->size = std::max( + _struct->size, _map_size * sizeof(Element) + _map_size * sizeof(SparseElement) + ) + ); } } @@ -203,5 +212,7 @@ class FixedSizeSparseMap { std::size_t _timestamp; SparseElement *_sparse; Element *_dense; + + IF_HEAP_PROFILING(heap_profiler::DataStructure *_struct); }; } // namespace kaminpar diff --git a/kaminpar-common/datastructures/marker.h b/kaminpar-common/datastructures/marker.h index c444a44d..0391a49e 100644 --- a/kaminpar-common/datastructures/marker.h +++ b/kaminpar-common/datastructures/marker.h @@ -14,14 +14,22 @@ #include "kaminpar-common/assert.h" +#include "kaminpar-common/heap_profiler.h" + namespace kaminpar { template class Marker { public: + explicit Marker() : _marker_id(0), _first_unmarked_element{0} { + RECORD_DATA_STRUCT(0, _struct); + } + explicit Marker(const std::size_t capacity) : _data(capacity), _marker_id(0), - _first_unmarked_element{0} {} + _first_unmarked_element{0} { + RECORD_DATA_STRUCT(capacity * sizeof(element_type), _struct); + } Marker(const Marker &) = delete; Marker &operator=(const Marker &) = delete; @@ -79,6 +87,7 @@ class Marker { } void resize(const std::size_t capacity) { + IF_HEAP_PROFILING(_struct->size = std::max(_struct->size, capacity * sizeof(element_type))); _data.resize(capacity); } @@ -90,5 +99,7 @@ class Marker { std::vector _data; element_type _marker_id; std::array _first_unmarked_element; + + IF_HEAP_PROFILING(heap_profiler::DataStructure *_struct); }; } // namespace kaminpar diff --git a/kaminpar-common/datastructures/queue.h b/kaminpar-common/datastructures/queue.h index d644582d..04dd619e 100644 --- a/kaminpar-common/datastructures/queue.h +++ b/kaminpar-common/datastructures/queue.h @@ -10,6 +10,8 @@ #include "kaminpar-common/assert.h" +#include "kaminpar-common/heap_profiler.h" + namespace kaminpar { /*! * Queue with fixed capacity. 
Add new elements to its tail and remove elements @@ -28,7 +30,9 @@ template class Queue { using iterator = typename std::vector::iterator; using const_iterator = typename std::vector::const_iterator; - explicit Queue(const std::size_t capacity) : _data(capacity) {} + explicit Queue(const std::size_t capacity) : _data(capacity) { + RECORD_DATA_STRUCT(capacity * sizeof(T), _struct); + } Queue(const Queue &) = delete; Queue &operator=(const Queue &) = delete; @@ -88,6 +92,7 @@ template class Queue { } void resize(const std::size_t capacity) { + IF_HEAP_PROFILING(_struct->size = std::max(_struct->size, capacity * sizeof(T))); _data.resize(capacity); clear(); } @@ -125,5 +130,7 @@ template class Queue { std::vector _data; std::size_t _head = 0; std::size_t _tail = 0; + + IF_HEAP_PROFILING(heap_profiler::DataStructure *_struct); }; } // namespace kaminpar diff --git a/kaminpar-common/datastructures/rating_map.h b/kaminpar-common/datastructures/rating_map.h index 7b5e1203..d95cebd4 100644 --- a/kaminpar-common/datastructures/rating_map.h +++ b/kaminpar-common/datastructures/rating_map.h @@ -110,7 +110,13 @@ class RatingMap { LARGE }; - explicit RatingMap(const std::size_t max_size) : _max_size{max_size} {} + explicit RatingMap(const std::size_t max_size = 0) : _max_size{max_size} {} + + RatingMap(const RatingMap &) = delete; + RatingMap &operator=(const RatingMap &) = delete; + + RatingMap(RatingMap &&) noexcept = default; + RatingMap &operator=(RatingMap &&) noexcept = default; MapType update_upper_bound(const std::size_t upper_bound_size) { select_map(upper_bound_size); @@ -133,6 +139,10 @@ class RatingMap { __builtin_unreachable(); } + [[nodiscard]] SmallMap &small_map() { + return _small_map; + } + [[nodiscard]] std::size_t small_map_counter() const { return _small_map_counter; } diff --git a/kaminpar-common/datastructures/scalable_vector.h b/kaminpar-common/datastructures/scalable_vector.h index b97d754b..05d602a7 100644 --- a/kaminpar-common/datastructures/scalable_vector.h +++ b/kaminpar-common/datastructures/scalable_vector.h @@ -5,12 +5,22 @@ #include "kaminpar-common/datastructures/noinit_vector.h" namespace kaminpar { +#ifdef KAMINPAR_ENABLE_HEAP_PROFILING +// @deprecated +template using scalable_vector = std::vector; +#else // @deprecated template using scalable_vector = std::vector>; +#endif +#ifdef KAMINPAR_ENABLE_HEAP_PROFILING +// @deprecated +template using scalable_noinit_vector = std::vector>; +#else // @deprecated template using scalable_noinit_vector = std::vector>>; +#endif template using ScalableVector = scalable_vector; diff --git a/kaminpar-common/datastructures/sparse_map.h b/kaminpar-common/datastructures/sparse_map.h index 1e13040b..d5ca6128 100644 --- a/kaminpar-common/datastructures/sparse_map.h +++ b/kaminpar-common/datastructures/sparse_map.h @@ -32,6 +32,8 @@ #include "kaminpar-common/assert.h" +#include "kaminpar-common/heap_profiler.h" + namespace kaminpar { template class SparseMap { struct Element { @@ -40,9 +42,12 @@ template class SparseMap { }; public: - SparseMap() = default; + SparseMap() { + RECORD_DATA_STRUCT(0, _struct); + } explicit SparseMap(const std::size_t capacity) : _capacity(capacity) { + RECORD_DATA_STRUCT(0, _struct); allocate_data(capacity); } @@ -144,6 +149,8 @@ template class SparseMap { _data = std::make_unique(num_elements); _sparse = reinterpret_cast(_data.get()); _dense = reinterpret_cast(_sparse + _capacity); + + IF_HEAP_PROFILING(_struct->size = std::max(_struct->size, num_elements * sizeof(std::size_t))); } std::size_t _capacity = 
0; @@ -151,5 +158,7 @@ template class SparseMap { std::unique_ptr _data = nullptr; std::size_t *_sparse = nullptr; Element *_dense = nullptr; + + IF_HEAP_PROFILING(heap_profiler::DataStructure *_struct); }; } // namespace kaminpar diff --git a/kaminpar-common/datastructures/static_array.h b/kaminpar-common/datastructures/static_array.h index e1126c2a..2cc29a00 100644 --- a/kaminpar-common/datastructures/static_array.h +++ b/kaminpar-common/datastructures/static_array.h @@ -7,6 +7,7 @@ #pragma once #include +#include #include #include #include @@ -14,7 +15,7 @@ #include #include "kaminpar-common/assert.h" -#include "kaminpar-common/parallel/atomic.h" +#include "kaminpar-common/heap_profiler.h" #include "kaminpar-common/parallel/tbb_malloc.h" namespace kaminpar { @@ -127,27 +128,27 @@ template class StaticArray { using iterator = StaticArrayIterator; using const_iterator = const StaticArrayIterator; - struct no_init {}; - - StaticArray(T *storage, const std::size_t size) : _size(size), _data(storage) {} + StaticArray(T *storage, const std::size_t size) : _size(size), _data(storage) { + RECORD_DATA_STRUCT(size * sizeof(T), _struct); + } StaticArray(const std::size_t start, const std::size_t size, StaticArray &data) : StaticArray(size, data._data + start) { KASSERT(start + size <= data.size()); } - StaticArray(const std::size_t size, value_type *data) : _size(size), _data(data) {} + StaticArray(const std::size_t size, value_type *data) : _size(size), _data(data) { + RECORD_DATA_STRUCT(size * sizeof(T), _struct); + } StaticArray(const std::size_t size, const value_type init_value = value_type()) { + RECORD_DATA_STRUCT(0, _struct); resize(size, init_value); } - StaticArray(const std::size_t size, no_init) { - resize(size, no_init{}); - } - - StaticArray(static_array::noinit_t, const std::size_t size) { - resize(size, no_init{}); + StaticArray(const std::size_t size, static_array::noinit_t) { + RECORD_DATA_STRUCT(0, _struct); + resize(size, static_array::noinit); } template @@ -174,6 +175,18 @@ template class StaticArray { // Data access members // + void write(const size_type pos, const_reference value) { + at(pos) = value; + } + + reference at(const size_type pos) { + return _data[pos]; + } + + const_reference at(const size_type pos) const { + return _data[pos]; + } + reference operator[](const size_type pos) { KASSERT(pos < _size); return _data[pos]; @@ -270,12 +283,7 @@ template class StaticArray { return _size; } - void resize(static_array::noinit_t, const std::size_t size) { - KASSERT(_data == _owned_data.get(), "cannot resize span", assert::always); - allocate_data(size); - } - - void resize(const std::size_t size, no_init) { + void resize(const std::size_t size, static_array::noinit_t) { KASSERT(_data == _owned_data.get(), "cannot resize span", assert::always); allocate_data(size); } @@ -285,7 +293,7 @@ template class StaticArray { const value_type init_value = value_type(), const bool assign_parallel = true ) { - resize(size, no_init{}); + resize(size, static_array::noinit); assign(size, init_value, assign_parallel); } @@ -319,49 +327,25 @@ template class StaticArray { _data = _owned_data.get(); _size = size; _unrestricted_size = _size; + + IF_HEAP_PROFILING(_struct->size = std::max(_struct->size, size * sizeof(value_type))); } size_type _size = 0; size_type _unrestricted_size = 0; parallel::tbb_unique_ptr _owned_data = nullptr; value_type *_data = nullptr; + + IF_HEAP_PROFILING(heap_profiler::DataStructure *_struct); }; namespace static_array { -template StaticArray copy(const 
StaticArray &arr) { - StaticArray cpy(arr.size()); - tbb::parallel_for(0, arr.size(), [&](const std::size_t i) { cpy[i] = arr[i]; }); - return cpy; -} - -template StaticArray create_from(const std::vector &vec) { - StaticArray arr(vec.size()); - std::copy(vec.begin(), vec.end(), arr.begin()); - return arr; +template StaticArray create(std::initializer_list list) { + return {list.begin(), list.end()}; } -template -StaticArray> create_atomic_from(const std::vector &vec) { - StaticArray> arr(vec.size()); - for (std::size_t i = 0; i < vec.size(); ++i) { - arr[i].store(vec[i]); - } - return arr; -} - -template std::vector release(const StaticArray &arr) { - std::vector vec(arr.size()); - std::copy(arr.begin(), arr.end(), vec.begin()); - return vec; -} - -template -std::vector release_nonatomic(const StaticArray> &arr) { - std::vector vec(arr.size()); - for (std::size_t i = 0; i < arr.size(); ++i) { - vec[i] = arr[i].load(); - } - return vec; +template StaticArray create(const std::vector &vec) { + return {vec.begin(), vec.end()}; } } // namespace static_array } // namespace kaminpar diff --git a/kaminpar-common/datastructures/ts_navigable_linked_list.h b/kaminpar-common/datastructures/ts_navigable_linked_list.h index c9891673..cee2bd59 100644 --- a/kaminpar-common/datastructures/ts_navigable_linked_list.h +++ b/kaminpar-common/datastructures/ts_navigable_linked_list.h @@ -91,10 +91,16 @@ template typename Container using NavigationMarker = typename LocalNavigableLinkedList::Marker; namespace ts_navigable_list { -template typename Container> -Container> combine( +template < + typename Key, + typename Element, + template + typename Container, + template + typename GlobalContainer> +GlobalContainer> combine( NavigableLinkedList &list, - Container> global_markers = {} + GlobalContainer> global_markers = {} ) { parallel::Atomic global_pos = 0; std::size_t num_markers = 0; diff --git a/kaminpar-common/heap_profiler.cc b/kaminpar-common/heap_profiler.cc new file mode 100644 index 00000000..de86ae3c --- /dev/null +++ b/kaminpar-common/heap_profiler.cc @@ -0,0 +1,332 @@ +/******************************************************************************* + * Heap profiler to measure heap memory usage. 
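For reference, a minimal sketch of the new static_array::create helpers from the static_array.h hunk above, which replace the removed copy/create_from/release functions (not part of the patch; template arguments in the hunk are garbled, so the sketch assumes the create<T> form):

#include <cstdint>
#include <iostream>
#include <vector>

#include "kaminpar-common/datastructures/static_array.h"

int main() {
  using namespace kaminpar;

  // Build a StaticArray from an initializer list ...
  StaticArray<std::uint32_t> a = static_array::create<std::uint32_t>({1, 2, 3});

  // ... or by copying an existing std::vector.
  std::vector<std::uint32_t> vec{4, 5, 6};
  StaticArray<std::uint32_t> b = static_array::create(vec);

  // An uninitialized allocation that is filled afterwards via the new write().
  StaticArray<std::uint32_t> c(3, static_array::noinit);
  for (std::size_t i = 0; i < c.size(); ++i) {
    c.write(i, a[i] + b[i]);
  }

  for (const std::uint32_t value : c) {
    std::cout << value << '\n'; // 5, 7, 9
  }
}
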
+ * + * @file: heap_profiler.cc + * @author: Daniel Salwasser + * @date: 21.10.2023 + ******************************************************************************/ +#include "kaminpar-common/heap_profiler.h" + +#include + +#include + +namespace kaminpar::heap_profiler { + +HeapProfiler &HeapProfiler::global() { + static HeapProfiler global("Global Heap Profiler"); + return global; +} + +HeapProfiler::HeapProfiler(std::string_view name) : _tree(name) {} + +HeapProfiler::~HeapProfiler() { + _tree.root.free(_node_allocator, _struct_allocator); +} + +void HeapProfiler::enable() { + _enabled = true; +} + +void HeapProfiler::disable() { + _enabled = false; +} + +void HeapProfiler::start_profile(std::string_view name, std::string desc) { + if (_enabled) { + std::lock_guard guard(_mutex); + + HeapProfileTreeNode *node = _node_allocator.create(name, desc, _tree.currentNode); + _tree.currentNode->children.push_back(node); + _tree.currentNode = node; + } +} + +void HeapProfiler::stop_profile() { + if (_enabled) { + KASSERT(_tree.currentNode->parent != nullptr, "The root heap profile cannot be stopped."); + std::lock_guard guard(_mutex); + + _tree.currentNode = _tree.currentNode->parent; + } +} + +ScopedHeapProfiler HeapProfiler::start_scoped_profile(std::string_view name, std::string desc) { + return ScopedHeapProfiler(name, desc); +} + +void HeapProfiler::record_data_struct( + std::string_view var_name, std::string_view file_name, std::size_t line +) { + if (_enabled) { + _var_name = var_name; + _file_name = file_name; + _line = line; + } +} + +DataStructure *HeapProfiler::add_data_struct(std::string name, std::size_t size) { + if (_enabled) { + std::lock_guard guard(_mutex); + + DataStructure *data_structure = _struct_allocator.create(std::move(name), size); + if (_line != 0) { + data_structure->variable_name = _var_name; + data_structure->file_name = _file_name; + data_structure->line = _line; + + _line = 0; + } + + _tree.currentNode->data_structures.push_back(data_structure); + return data_structure; + } + + return new DataStructure(std::move(name), size); +} + +void HeapProfiler::record_alloc(const void *ptr, std::size_t size) { + if (_enabled) { + std::lock_guard guard(_mutex); + + for (HeapProfileTreeNode *node = _tree.currentNode; node != nullptr; node = node->parent) { + node->allocs++; + node->alloc_size += size; + + if (std::size_t current_alloc = node->alloc_size - node->free_size; + node->alloc_size > node->free_size && current_alloc > node->max_alloc_size) { + node->max_alloc_size = current_alloc; + } + } + + _address_map.insert_or_assign(ptr, size); + } +} + +void HeapProfiler::record_free(const void *ptr) { + if (_enabled) { + std::lock_guard guard(_mutex); + + if (auto search = _address_map.find(ptr); search != _address_map.end()) { + std::size_t size = search->second; + for (HeapProfileTreeNode *node = _tree.currentNode; node != nullptr; node = node->parent) { + node->frees++; + node->free_size += size; + } + + _address_map.erase(search); + } + } +} + +void HeapProfiler::set_detailed_summary_options() { + set_max_depth(std::numeric_limits::max()); + set_print_data_structs(true); + set_min_data_struct_size(1); +} + +void HeapProfiler::set_max_depth(std::size_t max_depth) { + _max_depth = max_depth; +} + +void HeapProfiler::set_print_data_structs(bool print) { + _print_data_structs = print; +} + +void HeapProfiler::set_min_data_struct_size(float size) { + _min_data_struct_size = static_cast(size * 1024 * 1024); +} + +void HeapProfiler::print_heap_profile(std::ostream &out) { + 
HeapProfileTreeNode &root = *_tree.currentNode; + HeapProfileTreeStats stats(root); + + stats.max_alloc_size = + std::max(kMaxAllocTitle.length(), to_megabytes(stats.max_alloc_size).length()); + stats.alloc_size = std::max(kAllocTitle.length(), to_megabytes(stats.alloc_size).length()); + stats.free_size = std::max(kAllocTitle.length(), to_megabytes(stats.free_size).length()); + stats.allocs = std::max(kAllocsTitle.length(), std::to_string(stats.allocs).length()); + stats.frees = std::max(kFreesTitle.length(), std::to_string(stats.frees).length()); + + out << std::string(stats.len + kNameDel.length() + kPercentageLength - 1, kHeadingPadding) << ' '; + out << kMaxAllocTitle << std::string(stats.max_alloc_size - kMaxAllocTitle.length() + 1, ' '); + out << kAllocTitle << std::string(stats.alloc_size - kAllocTitle.length() + 1, ' '); + out << kFreeTitle << std::string(stats.free_size - kFreeTitle.length() + 1, ' '); + out << kAllocsTitle << std::string(stats.allocs - kAllocsTitle.length() + 1, ' '); + out << kFreesTitle << std::string(stats.frees - kFreesTitle.length() + 1, ' '); + out << '\n'; + + print_heap_tree_node(out, root, stats, _max_depth, _print_data_structs, _min_data_struct_size); + out << '\n'; +} + +std::size_t HeapProfiler::get_max_alloc() { + return _tree.currentNode->max_alloc_size; +} + +std::size_t HeapProfiler::get_alloc() { + return _tree.currentNode->alloc_size; +} + +std::size_t HeapProfiler::get_free() { + return _tree.currentNode->free_size; +} + +std::size_t HeapProfiler::get_allocs() { + return _tree.currentNode->allocs; +} + +std::size_t HeapProfiler::get_frees() { + return _tree.currentNode->frees; +} + +void HeapProfiler::print_heap_tree_node( + std::ostream &out, + const HeapProfileTreeNode &node, + const HeapProfileTreeStats stats, + std::size_t max_depth, + bool print_data_structs, + std::size_t min_data_struct_size, + std::size_t depth, + bool last +) { + if (depth > max_depth) { + return; + } + + print_indentation(out, depth, last); + print_percentage(out, node); + + out << node.name; + + std::size_t padding_length = stats.len - (depth * kBranchLength + node.name.length()); + if (!node.description.empty()) { + padding_length -= node.description.length() + 2; + out << '(' << node.description << ')'; + } + + out << kNameDel; + if (padding_length > 0) { + out << std::string(padding_length - 1, kPadding) << ' '; + } + + print_statistics(out, node, stats); + if (print_data_structs) { + print_data_structures(out, node, depth, node.children.empty(), min_data_struct_size); + } + + if (!node.children.empty()) { + const auto last_child = node.children.back(); + + for (auto const &child : node.children) { + const bool is_last = (child == last_child); + print_heap_tree_node( + out, + *child, + stats, + max_depth, + print_data_structs, + min_data_struct_size, + depth + 1, + is_last + ); + } + } +} + +void HeapProfiler::print_indentation(std::ostream &out, std::size_t depth, bool last) { + if (depth > 0) { + std::size_t leading_whitespaces = (depth - 1) * kBranchLength; + out << std::string(leading_whitespaces, ' ') << (last ? kTailBranch : kBranch); + } +} + +void HeapProfiler::print_percentage(std::ostream &out, const HeapProfileTreeNode &node) { + std::size_t parent_alloc_size = node.parent == nullptr ? 0 : node.parent->alloc_size; + float percentage = (parent_alloc_size == 0) ? 
1 : (node.alloc_size / (float)parent_alloc_size); + + out << "("; + + if (percentage >= 0.999995) { + out << "100.00"; + } else { + if (percentage < 0.1) { + out << "0"; + } + + out << percentage * 100; + } + + out << "%) "; +} + +void HeapProfiler::print_statistics( + std::ostream &out, const HeapProfileTreeNode &node, const HeapProfileTreeStats stats +) { + std::string max_alloc_size = to_megabytes(node.max_alloc_size); + out << max_alloc_size << std::string(stats.max_alloc_size - max_alloc_size.length() + 1, ' '); + + std::string alloc_size = to_megabytes(node.alloc_size); + out << alloc_size << std::string(stats.alloc_size - alloc_size.length() + 1, ' '); + + std::string free_size = to_megabytes(node.free_size); + out << free_size << std::string(stats.free_size - free_size.length() + 1, ' '); + + out << node.allocs << std::string(stats.allocs - std::to_string(node.allocs).length() + 1, ' ') + << node.frees << std::string(stats.frees - std::to_string(node.frees).length(), ' ') << '\n'; +} + +void HeapProfiler::print_data_structures( + std::ostream &out, + const HeapProfileTreeNode &node, + std::size_t depth, + bool last, + std::size_t min_data_struct_size +) { + std::vector> filtered_data_structures; + std::copy_if( + node.data_structures.begin(), + node.data_structures.end(), + std::back_inserter(filtered_data_structures), + [&](auto *data_structure) { return data_structure->size >= min_data_struct_size; } + ); + + if (filtered_data_structures.empty()) { + return; + } + + std::sort( + filtered_data_structures.begin(), + filtered_data_structures.end(), + [](auto *d1, auto *d2) { return d1->size > d2->size; } + ); + + auto last_data_structure = filtered_data_structures.back(); + for (auto data_structure : filtered_data_structures) { + const bool is_last = last && (data_structure == last_data_structure); + const bool has_info = data_structure->line > 0; + + std::size_t leading_whitespaces = depth * kBranchLength; + out << std::string(leading_whitespaces, ' ') << (is_last ? kTailBranch : kBranch); + + std::size_t max_alloc_size = node.max_alloc_size; + float percentage = (max_alloc_size == 0) ? 1 : (data_structure->size / (float)max_alloc_size); + if (percentage <= 1) { + out << '(' << (percentage * 100) << "%) "; + } + + out << data_structure->name; + if (has_info) { + out << " \"" << data_structure->variable_name << '\"'; + } + out << " uses " << to_megabytes(data_structure->size) << " mb "; + + if (has_info) { + out << " (" << data_structure->file_name << " at line " << data_structure->line << ')'; + } + + out << '\n'; + } +} + +} // namespace kaminpar::heap_profiler diff --git a/kaminpar-common/heap_profiler.h b/kaminpar-common/heap_profiler.h new file mode 100644 index 00000000..cd82a732 --- /dev/null +++ b/kaminpar-common/heap_profiler.h @@ -0,0 +1,595 @@ +/******************************************************************************* + * Heap profiler to measure heap memory usage. + * + * @file: heap_profiler.h + * @author: Daniel Salwasser + * @date: 21.10.2023 + ******************************************************************************/ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "kaminpar-common/libc_memory_override.h" + +namespace kaminpar::heap_profiler { + +/*! + * Returns the (demangled) name of a type. + * + * See https://stackoverflow.com/a/25893042 + * + * @tparam T The type whose name to return. + * @return The (demangled) name of the type T. 
+ */ +template std::string type_name() { + auto mangeled_name = typeid(T()).name(); + int status = 0; + + std::unique_ptr demangled_result{ + abi::__cxa_demangle(mangeled_name, NULL, NULL, &status), std::free + }; + + // Strip the trailing brackets from the constructed function type. + std::string name((status == 0) ? demangled_result.get() : mangeled_name); + if (name.substr(name.size() - 3) == " ()") { + name.resize(name.size() - 3); + } + + // Remove the namespace from the type name. + auto it = name.find_last_of("::"); + if (it != std::string::npos) { + name = name.substr(it + 1); + } + + // Remove the asterisk from a this pointer. + if (name.back() == '*') { + name.resize(name.size() - 1); + } + + return name; +} + +}; // namespace kaminpar::heap_profiler + +#ifdef KAMINPAR_ENABLE_HEAP_PROFILING + +// A macro to get the path of a source file in the project directory +// (https://stackoverflow.com/a/40947954) +#ifndef SOURCE_PATH_SIZE +#define SOURCE_PATH_SIZE 0 +#endif + +#define __FILENAME__ ((__FILE__) + (SOURCE_PATH_SIZE)) +#define GET_MACRO(X, Y, Z, FUNC, ...) FUNC + +#define START_HEAP_PROFILER_2(name, desc) \ + kaminpar::heap_profiler::HeapProfiler::global().start_profile(name, desc) +#define START_HEAP_PROFILER_1(name) START_HEAP_PROFILER_2(name, "") +#define START_HEAP_PROFILER(...) \ + GET_MACRO(_, __VA_ARGS__, START_HEAP_PROFILER_2, START_HEAP_PROFILER_1)(__VA_ARGS__) + +#define STOP_HEAP_PROFILER() kaminpar::heap_profiler::HeapProfiler::global().stop_profile() + +#define SCOPED_HEAP_PROFILER_2(name, desc, line) \ + const auto __SCOPED_HEAP_PROFILER__##line = \ + kaminpar::heap_profiler::HeapProfiler::global().start_scoped_profile(name, desc) +#define SCOPED_HEAP_PROFILER_1(name, line) SCOPED_HEAP_PROFILER_2(name, "", line) +#define SCOPED_HEAP_PROFILER(...) \ + GET_MACRO(_, __VA_ARGS__, SCOPED_HEAP_PROFILER_2, SCOPED_HEAP_PROFILER_1)(__VA_ARGS__, __LINE__) + +#define RECORD_DATA_STRUCT_2(size, variable_name) \ + variable_name = kaminpar::heap_profiler::HeapProfiler::global().add_data_struct( \ + kaminpar::heap_profiler::type_name(), size \ + ) +#define RECORD_DATA_STRUCT_1(size) \ + kaminpar::heap_profiler::HeapProfiler::global().add_data_struct( \ + kaminpar::heap_profiler::type_name(), size \ + ) +#define RECORD_DATA_STRUCT(...) \ + GET_MACRO(_, __VA_ARGS__, RECORD_DATA_STRUCT_2, RECORD_DATA_STRUCT_1)(__VA_ARGS__) + +#define RECORD_LOCAL_DATA_STRUCT_2(name, size, variable_name) \ + const auto variable_name = \ + kaminpar::heap_profiler::HeapProfiler::global().add_data_struct(name, size) +#define RECORD_LOCAL_DATA_STRUCT_1(name, size) \ + kaminpar::heap_profiler::HeapProfiler::global().add_data_struct(name, size) +#define RECORD_LOCAL_DATA_STRUCT(...) \ + GET_MACRO(__VA_ARGS__, RECORD_LOCAL_DATA_STRUCT_2, RECORD_LOCAL_DATA_STRUCT_1)(__VA_ARGS__) + +#define RECORD(name) \ + kaminpar::heap_profiler::HeapProfiler::global().record_data_struct(name, __FILENAME__, __LINE__); + +#define IF_HEAP_PROFILING(expression) expression + +#define ENABLE_HEAP_PROFILER() kaminpar::heap_profiler::HeapProfiler::global().enable() +#define DISABLE_HEAP_PROFILER() kaminpar::heap_profiler::HeapProfiler::global().disable() + +#define PRINT_HEAP_PROFILE(out) \ + kaminpar::heap_profiler::HeapProfiler::global().print_heap_profile(out) + +/*! + * Whether heap profiling is enabled. + */ +constexpr bool kHeapProfiling = true; + +#else + +#define START_HEAP_PROFILER(...) +#define STOP_HEAP_PROFILER() +#define SCOPED_HEAP_PROFILER(...) +#define RECORD_DATA_STRUCT(...) 
+#define RECORD_LOCAL_DATA_STRUCT(...) +#define RECORD(...) +#define IF_HEAP_PROFILING(...) +#define ENABLE_HEAP_PROFILER() +#define DISABLE_HEAP_PROFILER() +#define PRINT_HEAP_PROFILE(...) + +/*! + * Whether heap profiling is enabled. + */ +constexpr bool kHeapProfiling = false; + +#endif + +#ifdef KAMINPAR_ENABLE_PAGE_PROFILING +constexpr bool kPageProfiling = true; +#else +constexpr bool kPageProfiling = false; +#endif + +namespace kaminpar::heap_profiler { + +/*! + * A minimal allocator that uses memory allocation functions which bypass the heap profiler. + * + * This is required for allocations inside the heap profiler, otherwise a memory allocation would + * lead to an infinite recursion. + */ +template struct NoProfilAllocator { + using value_type = T; + + NoProfilAllocator() noexcept {} + template NoProfilAllocator(const NoProfilAllocator &) noexcept {} + + template bool operator==(const NoProfilAllocator &) const noexcept { + return true; + } + template bool operator!=(const NoProfilAllocator &) const noexcept { + return false; + } + + T *allocate(const size_t n) const { + if (n == 0) { + return nullptr; + } + + if (n > static_cast(-1) / sizeof(T)) { + throw std::bad_array_new_length(); + } + +#ifdef KAMINPAR_ENABLE_HEAP_PROFILING + void *const ptr = std_malloc(n * sizeof(T)); +#else + void *const ptr = std::malloc(n * sizeof(T)); +#endif + if (!ptr) { + throw std::bad_alloc(); + } + + return static_cast(ptr); + } + + void deallocate(T *const ptr, size_t) const noexcept { +#ifdef KAMINPAR_ENABLE_HEAP_PROFILING + std_free(ptr); +#else + std::free(ptr); +#endif + } + + template T *create(Args &&...args) const { + T *t = allocate(1); + new (t) T(std::forward(args)...); + return t; + } + + void destruct(T *const t) const { + t->~T(); + deallocate(t, 1); + } +}; + +/*! + * Represents a data structure in the program. It contains information about a data structure that + * is tracked by the heap profiler. + */ +struct DataStructure { + /*! + * The name of the data structure. + */ + std::string name; + + /*! + * The size of the memory in bytes allocated on the heap by the data structure. + */ + std::size_t size; + + /*! + * The name of the variable storing the data structure. It is empty if it is not available. + */ + std::string_view variable_name; + /*! + * The name of the source file of the variable storing the data structure. It is empty if it is + * not available. + */ + std::string_view file_name; + /*! + * The line of the variable storing the data structure. It is zero if it is not available. + */ + std::size_t line; + + /*! + * Constructs a new data structure. + * + * @param name The name of the data structure. + * @param size The size of the memory in bytes allocated on the heap by the data structure. + */ + explicit DataStructure(std::string name, std::size_t size) + : name(std::move(name)), + size(size), + variable_name(""), + file_name(""), + line(0) {} +}; + +class ScopedHeapProfiler; + +/*! + * A hierarchical heap profiler to measure dynamic memory allocation of the program. + * + * The memory allocation operations of libc are overridden to additionally call the global heap + * profiler on each allocation and deallocation request. 
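+ *
+ * A minimal usage sketch via the profiling macros (illustrative only; the profile names are made
+ * up):
+ * \code
+ * ENABLE_HEAP_PROFILER();
+ * START_HEAP_PROFILER("Coarsening", "first level");
+ * // ... code that allocates and frees heap memory ...
+ * STOP_HEAP_PROFILER();
+ * DISABLE_HEAP_PROFILER();
+ * PRINT_HEAP_PROFILE(std::cout);
+ * \endcode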
+ */ +class HeapProfiler { +private: + static constexpr std::string_view kMaxAllocTitle = "Peak Memory (mb)"; + static constexpr std::string_view kAllocTitle = "Total Alloc (mb)"; + static constexpr std::string_view kFreeTitle = "Total Free (mb)"; + static constexpr std::string_view kAllocsTitle = "Allocs"; + static constexpr std::string_view kFreesTitle = "Frees"; + + static constexpr std::string_view kBranch = "|- "; + static constexpr std::string_view kTailBranch = "`- "; + static constexpr std::string_view kTailEdge = " "; + static constexpr std::string_view kNameDel = ": "; + static constexpr char kHeadingPadding = '-'; + static constexpr char kPadding = '.'; + + static constexpr std::size_t kBranchLength = 3; + static constexpr std::size_t kPercentageLength = 10; + static constexpr std::size_t kDataStructSizeThreshold = 1024; + + static std::string to_megabytes(std::size_t bytes) { + std::stringstream stream; + stream << std::fixed << std::setprecision(2) << (bytes / (float)(1024 * 1024)); + return stream.str(); + } + + struct HeapProfileTreeNode { + std::string_view name; + std::string description; + + HeapProfileTreeNode *parent; + std::vector> children; + + std::size_t max_alloc_size; + std::size_t alloc_size; + std::size_t free_size; + std::size_t allocs; + std::size_t frees; + + std::vector> data_structures; + + HeapProfileTreeNode(std::string_view name, std::string description, HeapProfileTreeNode *parent) + : name(name), + description(description), + parent(parent), + max_alloc_size(0), + alloc_size(0), + free_size(0), + allocs(0), + frees(0) {} + + template + void free(NodeAllocator node_allocator, DataStructAllocator data_struct_allocator) { + for (DataStructure *data_structure : data_structures) { + data_struct_allocator.destruct(data_structure); + } + + for (HeapProfileTreeNode *child : children) { + child->free(node_allocator, data_struct_allocator); + node_allocator.destruct(child); + } + } + }; + + struct HeapProfileTree { + HeapProfileTreeNode root; + HeapProfileTreeNode *currentNode; + + HeapProfileTree(std::string_view name) : root(name, "", nullptr), currentNode(&root) {} + }; + + struct HeapProfileTreeStats { + std::size_t len; + std::size_t max_alloc_size; + std::size_t alloc_size; + std::size_t free_size; + std::size_t allocs; + std::size_t frees; + + HeapProfileTreeStats(const HeapProfileTreeNode &node) { + std::size_t name_length = node.name.length(); + if (!node.description.empty()) { + name_length += node.description.length() + 2; + } + + len = name_length; + max_alloc_size = node.max_alloc_size; + alloc_size = node.alloc_size; + free_size = node.free_size; + allocs = node.allocs; + frees = node.frees; + + for (auto const &child : node.children) { + HeapProfileTreeStats child_stats(*child); + + len = std::max(len, child_stats.len + kBranchLength); + max_alloc_size = std::max(max_alloc_size, child_stats.max_alloc_size); + alloc_size = std::max(alloc_size, child_stats.alloc_size); + free_size = std::max(free_size, child_stats.free_size); + allocs = std::max(allocs, child_stats.allocs); + frees = std::max(frees, child_stats.frees); + } + } + }; + +public: + /** + * Returns the global heap profiler. + * + * @return The global heap profiler. + */ + static HeapProfiler &global(); + + /*! + * Constructs a new heap profiler. + * + * @param name The name of the heap profiler and the name of the root profile. + */ + explicit HeapProfiler(std::string_view name); + + /*! + * Destroys the heap profiler. + */ + ~HeapProfiler(); + + /*! + * Starts profiling the heap. 
+ */ + void enable(); + + /*! + * Stops profiling the heap. + */ + void disable(); + + /** + * Starts a new profile, adds it as a child profile to the current profile, and sets it to the + * current profile. + * + * @param name The name of the profile to start. + * @param desc The description of the profile to start. + */ + void start_profile(std::string_view name, std::string desc); + + /*! + * Stops the current profile and sets the new current profile to the parent profile. + */ + void stop_profile(); + + /*! + * Starts a scoped heap profile and returns the associated object. + * + * @param name The name of the profile to start. + * @param desc The description of the profile to start. + */ + ScopedHeapProfiler start_scoped_profile(std::string_view name, std::string desc); + + /*! + * Records information about the variable storing the next data structure that is added to the + * heap profiler. + * + * @param var_name The name of the variable storing the data structure. + * @param file_name The name of the source file of the variable storing the data structure. + * @param line The line of the variable storing the data structure. + */ + void record_data_struct(std::string_view var_name, std::string_view file_name, std::size_t line); + + /*! + * Adds a data structure to track to the current profile of the heap profiler. If information + * about the variable that stores the data structure has been recorded by the heap profiler, it is + * added. + * + * @param name The name of the data structure. + * @param size The size of the memory in bytes allocated on the heap by the data structure. + * @return A pointer to the object holding information about the data structure or a nullptr if + * the heap profiler is disabled. + */ + DataStructure *add_data_struct(std::string name, std::size_t size); + + /*! + * Records a memory allocation. + * + * @param ptr The pointer to the beginning of the allocated memory. + * @param size The number allocated bytes. + */ + void record_alloc(const void *ptr, std::size_t size); + + /*! + * Records a memory deallocation. + * + * @param ptr The pointer to the beginning of the allocated memory + */ + void record_free(const void *ptr); + + /*! + * Sets the options such that the printed summary contains detailed information. + */ + void set_detailed_summary_options(); + + /*! + * Sets the maximum depth shown in the summary. + * + * @param max_depth The maximum depth shown in the summary. + */ + void set_max_depth(std::size_t max_depth); + + /*! + * Sets the option whether to print data structure memory statistics in the summary. + * + * @param print Whether to print data structure memory statistics in the summary. + */ + void set_print_data_structs(bool print); + + /*! + * Sets the minimum size of a data structure in MB to be included in the summary. + * + * @param size The minimum size of a data structure in MB to be included in the summary. + */ + void set_min_data_struct_size(float size); + + /*! + * Prints information about the heap profile to the output stream. + * + * @param out The output stream to write to. + */ + void print_heap_profile(std::ostream &out); + + /*! + * Returns the amount of maximum allocated memory in bytes of the current heap profile. + * + * @return The amount of maximum allocated memory in bytes of the current heap profile. + */ + std::size_t get_max_alloc(); + + /*! + * Returns the amount of allocated memory in bytes of the current heap profile. + * + * @return The amount of allocated memory in bytes of the current heap profile. 
+ */ + std::size_t get_alloc(); + + /*! + * Returns the amount of freed memory in bytes of the current heap profile. + * + * @return The amount of freed memory in bytes of the current heap profile. + */ + std::size_t get_free(); + + /*! + * Returns the amount of alloc operations of the current heap profile. + * + * @return The amount of alloc operations of the current heap profile. + */ + std::size_t get_allocs(); + + /*! + * Returns the amount of free operations of the current heap profile. + * + * @return The amount of free operations of the current heap profile. + */ + std::size_t get_frees(); + +private: + bool _enabled = false; + std::mutex _mutex; + + NoProfilAllocator _node_allocator; + HeapProfileTree _tree; + std::unordered_map< + const void *, + std::size_t, + std::hash, + std::equal_to, + NoProfilAllocator>> + _address_map; + + NoProfilAllocator _struct_allocator; + std::string_view _var_name; + std::string_view _file_name; + std::size_t _line; + + std::size_t _max_depth = std::numeric_limits::max(); + bool _print_data_structs = true; + std::size_t _min_data_struct_size = 0; + + static void print_heap_tree_node( + std::ostream &out, + const HeapProfileTreeNode &node, + const HeapProfileTreeStats stats, + std::size_t max_depth, + bool print_data_structs, + std::size_t min_data_struct_size, + std::size_t depth = 0, + bool last = false + ); + static void print_indentation(std::ostream &out, std::size_t depth, bool last); + static void print_percentage(std::ostream &out, const HeapProfileTreeNode &node); + static void print_statistics( + std::ostream &out, const HeapProfileTreeNode &node, const HeapProfileTreeStats stats + ); + static void print_data_structures( + std::ostream &out, + const HeapProfileTreeNode &node, + std::size_t depth, + bool last, + std::size_t min_data_struct_size + ); +}; + +/*! + * A helper class for scoped heap profiling. The profile starts with the construction of the object + * and ends with the destruction of the object. + */ +class ScopedHeapProfiler { +public: + /*! + * Constructs a new scoped heap profiler and thereby starting a new heap profile. + * + * @param name The name of the started profile. + * @param description The description of the started profile. + */ + ScopedHeapProfiler(std::string_view name, std::string description) { + HeapProfiler::global().start_profile(name, description); + } + + /*! + * Deconstructs the scoped heap profiler and thereby stopping the heap profile. + */ + inline ~ScopedHeapProfiler() { + HeapProfiler::global().stop_profile(); + } +}; + +} // namespace kaminpar::heap_profiler diff --git a/kaminpar-common/libc_memory_override.cc b/kaminpar-common/libc_memory_override.cc new file mode 100644 index 00000000..12508aaa --- /dev/null +++ b/kaminpar-common/libc_memory_override.cc @@ -0,0 +1,126 @@ +/******************************************************************************* + * This file overwrites the memory allocation operations of libc with operations that additionally + * invoke the heap profiler. 
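+ * On glibc, the wrappers defined below forward each request to the internal __libc_* entry points
+ * (e.g., __libc_malloc), so the original allocation still takes place and only the bookkeeping is
+ * added.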
+ *
+ * @file: libc_memory_override.cc
+ * @author: Daniel Salwasser
+ * @date: 22.10.2023
+ ******************************************************************************/
+#include "kaminpar-common/libc_memory_override.h"
+
+#include <cerrno>
+
+#include "kaminpar-common/heap_profiler.h"
+
+#ifdef KAMINPAR_ENABLE_HEAP_PROFILING
+#ifdef __GLIBC__
+extern "C" {
+
+using kaminpar::heap_profiler::HeapProfiler;
+
+extern void *__libc_malloc(size_t);
+extern void *__libc_calloc(size_t, size_t);
+extern void *__libc_realloc(void *, size_t);
+extern void __libc_free(void *);
+extern void *__libc_memalign(size_t, size_t);
+extern void *__libc_valloc(size_t);
+extern void *__libc_pvalloc(size_t);
+
+void *malloc(size_t size) {
+  void *ptr = __libc_malloc(size);
+  HeapProfiler::global().record_alloc(ptr, size);
+  return ptr;
+}
+
+void *calloc(size_t size, size_t n) {
+  void *ptr = __libc_calloc(size, n);
+  HeapProfiler::global().record_alloc(ptr, size * n);
+  return ptr;
+}
+
+void *realloc(void *p, size_t newsize) {
+  void *ptr = __libc_realloc(p, newsize);
+  HeapProfiler::global().record_free(p);
+  HeapProfiler::global().record_alloc(ptr, newsize);
+  return ptr;
+}
+
+void free(void *p) {
+  __libc_free(p);
+  HeapProfiler::global().record_free(p);
+}
+
+void *aligned_alloc(size_t alignment, size_t size) {
+  // Unlike the other allocation functions, glibc does not expose aligned_alloc through a weak
+  // __libc_* symbol. We therefore forward to __libc_memalign after checking that the requested
+  // alignment is valid.
+  bool is_power_of_2 = (alignment & (alignment - 1)) == 0;
+  if (!is_power_of_2 || alignment == 0) {
+    errno = EINVAL;
+    return nullptr;
+  }
+
+  void *ptr = __libc_memalign(alignment, size);
+  HeapProfiler::global().record_alloc(ptr, size);
+  return ptr;
+}
+
+void *memalign(size_t alignment, size_t size) {
+  void *ptr = __libc_memalign(alignment, size);
+  HeapProfiler::global().record_alloc(ptr, size);
+  return ptr;
+}
+
+void *valloc(size_t size) {
+  void *ptr = __libc_valloc(size);
+  HeapProfiler::global().record_alloc(ptr, size);
+  return ptr;
+}
+
+void *pvalloc(size_t size) {
+  void *ptr = __libc_pvalloc(size);
+  HeapProfiler::global().record_alloc(ptr, size);
+  return ptr;
+}
+
+#ifdef KAMINPAR_ENABLE_PAGE_PROFILING
+extern void *__mmap(void *, size_t, int, int, int, off_t);
+extern int __munmap(void *, size_t);
+
+void *mmap(void *addr, size_t len, int prot, int flags, int fd, __off_t offset) {
+  // Record the address actually returned by mmap, not the caller-supplied hint.
+  void *ptr = __mmap(addr, len, prot, flags, fd, offset);
+  HeapProfiler::global().record_alloc(ptr, len);
+  return ptr;
+}
+
+int munmap(void *addr, size_t len) {
+  int return_value = __munmap(addr, len);
+  HeapProfiler::global().record_free(addr);
+  return return_value;
+}
+#endif
+}
+#else
+#error Heap profiling is only supported on systems using glibc.
+#endif
+#endif
+
+namespace kaminpar::heap_profiler {
+
+void *std_malloc(std::size_t size) {
+#ifdef KAMINPAR_ENABLE_HEAP_PROFILING
+  return __libc_malloc(size);
+#else
+  return std::malloc(size);
+#endif
+}
+
+void std_free(void *ptr) {
+#ifdef KAMINPAR_ENABLE_HEAP_PROFILING
+  __libc_free(ptr);
+#else
+  std::free(ptr);
+#endif
+}
+
+} // namespace kaminpar::heap_profiler
diff --git a/kaminpar-common/libc_memory_override.h b/kaminpar-common/libc_memory_override.h
new file mode 100644
index 00000000..1d69a4fe
--- /dev/null
+++ b/kaminpar-common/libc_memory_override.h
@@ -0,0 +1,33 @@
+/*******************************************************************************
+ * This file overwrites the memory allocation operations of libc with operations that additionally
+ * invoke the heap profiler.
+ *
+ * @file: libc_memory_override.h
+ * @author: Daniel Salwasser
+ * @date: 22.10.2023
+ ******************************************************************************/
+#pragma once
+
+#include <cstddef>
+
+namespace kaminpar::heap_profiler {
+
+/*!
+ * Allocates size bytes of uninitialized memory. The allocation request is directly forwarded to
+ * malloc and thus not captured by the heap profiler.
+ *
+ * @param size The number of bytes to allocate.
+ *
+ * @return A pointer to the beginning of the newly allocated memory on success, otherwise a null
+ * pointer.
+ */
+void *std_malloc(std::size_t size);
+
+/*!
+ * Deallocates the memory previously allocated by std_malloc.
+ *
+ * @param ptr The pointer to the memory to be deallocated.
+ */
+void std_free(void *ptr);
+
+} // namespace kaminpar::heap_profiler
diff --git a/kaminpar-common/math.h b/kaminpar-common/math.h
index 45b76b16..461f7273 100644
--- a/kaminpar-common/math.h
+++ b/kaminpar-common/math.h
@@ -17,6 +17,44 @@ #include "kaminpar-common/assert.h" namespace kaminpar::math {
+
+/*!
+ * Returns the absolute value of a (possibly signed) integer as std::size_t.
+ *
+ * @param value The integer whose absolute value to return.
+ * @return The absolute value of the integer.
+ */
+template <typename Int> constexpr std::size_t abs(Int value) {
+  if (value < 0) {
+    value *= -1;
+  }
+
+  return static_cast<std::size_t>(value);
+}
+
+/*!
+ * Returns the absolute difference between two (possibly unsigned) integers.
+ *
+ * @param x The first integer.
+ * @param y The second integer.
+ * @return The absolute difference of x and y.
+ */
+template <typename Int1, typename Int2> constexpr std::size_t abs_diff(const Int1 x, const Int2 y) {
+  return x > y ? x - y : y - x;
+}
+
+/*!
+ * Divides two integers with ceil rounding.
+ *
+ * @param x The dividend, which must be non-zero.
+ * @param y The divisor.
+ * @return The ceiling of x divided by y.
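+ *
+ * For example, div_ceil(10, 4) == 3 and div_ceil(8, 4) == 2.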
+ */ +template constexpr Int1 div_ceil(const Int1 x, const Int2 y) { + return 1 + ((x - 1) / y); +} + template bool is_square(const Int value) { const Int sqrt = std::sqrt(value); return sqrt * sqrt == value; @@ -58,6 +96,15 @@ template T ceil2(const T arg) { return 1 << ceil_log2(arg); } +template constexpr Int byte_width(const Int i) { + if (i == 0) { + return 1; + } + + const Int bit_width = 1 + floor_log2(i); + return div_ceil(bit_width, 8); +} + template double percentile(const std::vector &sorted_sequence, const double percentile) { KASSERT([&] { @@ -75,7 +122,8 @@ double percentile(const std::vector &sorted_sequence, const double percentile template auto split_integral(const T value, const double ratio = 0.5) { return std::pair{ - static_cast(std::ceil(value * ratio)), static_cast(std::floor(value * (1.0 - ratio)))}; + static_cast(std::ceil(value * ratio)), static_cast(std::floor(value * (1.0 - ratio))) + }; } /** diff --git a/kaminpar-common/parallel/aligned_element.h b/kaminpar-common/parallel/aligned_element.h index 828965de..bf1d4bea 100644 --- a/kaminpar-common/parallel/aligned_element.h +++ b/kaminpar-common/parallel/aligned_element.h @@ -9,6 +9,7 @@ #include namespace kaminpar::parallel { + template struct alignas(64) Aligned { Value value; @@ -33,4 +34,44 @@ template struct alignas(64) Aligned { return value != other; } }; + +template struct alignas(64) AlignedVec { + Vector vec; + + AlignedVec() : vec() {} + AlignedVec(Vector vec) : vec(vec) {} + + decltype(auto) operator[](std::size_t pos) { + return vec[pos]; + } + + decltype(auto) operator[](std::size_t pos) const { + return vec[pos]; + } + + decltype(auto) begin() noexcept { + return vec.begin(); + } + + decltype(auto) begin() const noexcept { + return vec.begin(); + } + + decltype(auto) end() noexcept { + return vec.end(); + } + + decltype(auto) end() const noexcept { + return vec.end(); + } + + void clear() noexcept { + vec.clear(); + } + + void resize(std::size_t count) { + vec.resize(count); + } +}; + } // namespace kaminpar::parallel diff --git a/kaminpar-common/parallel/tbb_malloc.h b/kaminpar-common/parallel/tbb_malloc.h index 9edf1ca0..421b6052 100644 --- a/kaminpar-common/parallel/tbb_malloc.h +++ b/kaminpar-common/parallel/tbb_malloc.h @@ -11,30 +11,34 @@ #include #include "kaminpar-common/assert.h" +#include "kaminpar-common/heap_profiler.h" namespace kaminpar::parallel { template struct tbb_deleter { void operator()(T *p) { scalable_free(p); + + if constexpr (kHeapProfiling && !kPageProfiling) { + heap_profiler::HeapProfiler::global().record_free(p); + } } }; template using tbb_unique_ptr = std::unique_ptr>; +// template using tbb_unique_ptr = std::unique_ptr; template tbb_unique_ptr make_unique(const std::size_t size) { auto nbytes = sizeof(T) * size; T *ptr = static_cast(scalable_malloc(nbytes)); + KASSERT( - ptr != nullptr, - "probably out of memory after attemping to allocate " << nbytes << " bytes", - assert::light + ptr != nullptr, "out of memory: could not allocate " << nbytes << " bytes", assert::light ); - return tbb_unique_ptr(ptr, tbb_deleter{}); -} -template tbb_unique_ptr make_unique(Args &&...args) { - void *memory = static_cast(scalable_malloc(sizeof(T))); - T *ptr = new (memory) T(std::forward(args)...); + if constexpr (kHeapProfiling && !kPageProfiling) { + heap_profiler::HeapProfiler::global().record_alloc(ptr, sizeof(T) * size); + } + return tbb_unique_ptr(ptr, tbb_deleter{}); } } // namespace kaminpar::parallel diff --git a/kaminpar-common/ranges.h b/kaminpar-common/ranges.h index 
4bcfa5b3..e69799a3 100644 --- a/kaminpar-common/ranges.h +++ b/kaminpar-common/ranges.h @@ -7,6 +7,7 @@ ******************************************************************************/ #pragma once +#include #include #include diff --git a/kaminpar-common/varint_codec.cc b/kaminpar-common/varint_codec.cc new file mode 100644 index 00000000..d2bfed3c --- /dev/null +++ b/kaminpar-common/varint_codec.cc @@ -0,0 +1,32 @@ +/******************************************************************************* + * Encoding and decoding methods for VarInts. + * + * @file: varint_codec.cc + * @author: Daniel Salwasser + * @date: 26.12.2023 + ******************************************************************************/ +#include "kaminpar-common/varint_codec.h" + +namespace kaminpar { + +namespace debug { + +static VarIntStats stats = {0, 0, 0, 0, 0, 0}; + +void varint_stats_reset() { + stats.varint_count = 0; + stats.signed_varint_count = 0; + stats.marked_varint_count = 0; + + stats.varint_bytes = 0; + stats.signed_varint_bytes = 0; + stats.marked_varint_bytes = 0; +} + +VarIntStats &varint_stats_global() { + return stats; +} + +} // namespace debug + +} // namespace kaminpar diff --git a/kaminpar-common/varint_codec.h b/kaminpar-common/varint_codec.h new file mode 100644 index 00000000..5ee0158e --- /dev/null +++ b/kaminpar-common/varint_codec.h @@ -0,0 +1,556 @@ +/******************************************************************************* + * Encoding and decoding methods for VarInts. + * + * @file: varint_codec.h + * @author: Daniel Salwasser + * @date: 11.11.2023 + ******************************************************************************/ +#pragma once + +#include +#include +#include + +#include + +namespace kaminpar { + +namespace debug { + +/*! + * Whether to track statistics on encoded VarInts. + */ +static constexpr bool kTrackVarintStats = false; + +/*! + * Statistics about encoded VarInts. + */ +struct VarIntStats { + std::size_t varint_count; + std::size_t signed_varint_count; + std::size_t marked_varint_count; + + std::size_t varint_bytes; + std::size_t signed_varint_bytes; + std::size_t marked_varint_bytes; +}; + +/*! + * Reset the global statistics on encoded VarInts. + */ +void varint_stats_reset(); + +/*! + * Returns a reference to the global statistics on encoded VarInts. + * + * @return A reference to the global statistics on encoded VarInts. + */ +VarIntStats &varint_stats_global(); + +} // namespace debug + +/*! + * Encodes a signed integer using zigzag encoding. + * + * @param i The signed integer to encode. + * @return The encoded integer. + */ +template [[nodiscard]] std::make_unsigned_t zigzag_encode(Int i) { + return (i >> (sizeof(Int) * 8 - 1)) ^ (i << 1); +} + +/*! + * Decodes a zigzag encoded integer. + * + * @param i The zigzag encoded integer to decode. + * @return The decoded integer. + */ +template [[nodiscard]] std::make_signed_t zigzag_decode(Int i) { + return (i >> 1) ^ -(i & 1); +} + +/*! + * Returns the maximum number of bytes that a VarInt needs to be stored. + * + * @tparam Int The type of integer whose encoded maximum length is returned. + */ +template [[nodiscard]] constexpr std::size_t varint_max_length() { + return (sizeof(Int) * 8) / 7 + 1; +} + +/*! + * Returns the number of bytes a VarInt needs to be stored. + * + * @tparam Int The type of integer whose encoded length is returned. + * @param Int The integer to store. + * @return The number of bytes the integer needs to be stored. 
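+ *
+ * For example, values up to 127 occupy a single byte and values from 128 to 16383 occupy two
+ * bytes, since each encoded byte carries seven payload bits.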
+ */ +template [[nodiscard]] std::size_t varint_length(Int i) { + std::size_t len = 1; + + while (i > 0b01111111) { + i >>= 7; + len++; + } + + return len; +} + +/*! + * Returns the number of bytes a signed VarInt needs to be stored. + * + * @tparam Int The type of integer whose encoded length is returned. + * @param Int The integer to store. + * @return The number of bytes the integer needs to be stored. + */ +template [[nodiscard]] std::size_t signed_varint_length(Int i) { + return varint_length(zigzag_encode(i)); +} + +/*! + * Returns the number of bytes a marked VarInt needs to be stored. + * + * @tparam Int The type of integer whose encoded length is returned. + * @param Int The integer to store. + * @return The number of bytes the integer needs to be stored. + */ +template [[nodiscard]] std::size_t marked_varint_length(Int i) { + std::size_t len = 1; + + i >>= 6; + if (i > 0) { + len += varint_length(i); + } + + return len; +} + +/*! + * Writes an integer to a memory location as a VarInt. + * + * @tparam Int The type of integer to encode. + * @param Int The integer to store. + * @param ptr The pointer to the memory location to write the integer to. + * @return The number of bytes that the integer occupies at the memory location. + */ +template std::size_t varint_encode(Int i, std::uint8_t *ptr) { + std::size_t len = 1; + + while (i > 0b01111111) { + std::uint8_t octet = (i & 0b01111111) | 0b10000000; + *ptr = octet; + + i >>= 7; + ptr++; + len++; + } + + std::uint8_t last_octet = i & 0b01111111; + *ptr = last_octet; + + if (debug::kTrackVarintStats) { + debug::varint_stats_global().varint_count++; + debug::varint_stats_global().varint_bytes += len; + } + + return len; +} + +/*! + * Writes an integer to a memory location as a signed VarInt. + * + * @tparam Int The type of integer to encode. + * @param Int The integer to store. + * @param ptr The pointer to the memory location to write the integer to. + * @return The number of bytes that the integer occupies at the memory location. + */ +template std::size_t signed_varint_encode(Int i, std::uint8_t *ptr) { + const std::size_t len = varint_encode(zigzag_encode(i), ptr); + + if (debug::kTrackVarintStats) { + debug::varint_stats_global().signed_varint_count++; + debug::varint_stats_global().signed_varint_bytes += len; + } + + return len; +} + +/*! + * Writes an integer to a memory location as a marked VarInt. + * + * @tparam Int The type of integer to encode. + * @param Int The integer to store. + * @param marker_set Whether the integer is marked. + * @param ptr The pointer to the memory location to write the integer to. + * @return The number of bytes that the integer occupies at the memory location. + */ +template +std::size_t marked_varint_encode(Int i, bool marker_set, std::uint8_t *ptr) { + std::uint8_t first_octet; + + if (marker_set) { + first_octet = (i & 0b00111111) | 0b01000000; + } else { + first_octet = (i & 0b00111111); + } + + i >>= 6; + + if (i > 0) { + first_octet |= 0b10000000; + *ptr = first_octet; + + std::size_t len = varint_encode(i, ptr + 1) + 1; + + if (debug::kTrackVarintStats) { + debug::varint_stats_global().marked_varint_count++; + debug::varint_stats_global().marked_varint_bytes += len; + } + + return len; + } + + if (debug::kTrackVarintStats) { + debug::varint_stats_global().marked_varint_count++; + debug::varint_stats_global().marked_varint_bytes++; + } + + *ptr = first_octet; + return 1; +} + +/*! + * Reads an integer encoded as a VarInt from a memory location. 
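+ * For example, the value 300 is encoded by varint_encode as the two bytes 0b10101100 and
+ * 0b00000010 (a set continuation bit plus seven payload bits per byte) and is decoded back to
+ * 300 here.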
The decoding is implemented as a + * loop with non intrinsic operations. + * + * @tparam Int The type of integer to decode. + * @param ptr The pointer to the memory location to read the integer from. + * @return A pair consisting of the decoded integer and the number of bytes that the encoded integer + * occupied at the memory location. + */ +template +[[nodiscard]] std::pair varint_decode_general(const std::uint8_t *ptr) { + Int result = 0; + std::size_t shift = 0; + std::size_t position = 0; + + while (true) { + const std::uint8_t byte = ptr[position++]; + + if ((byte & 0b10000000) == 0) { + result |= static_cast(byte) << shift; + break; + } else { + result |= static_cast(byte & 0b01111111) << shift; + } + + shift += 7; + } + + return std::make_pair(result, position); +} + +/*! + * Reads an integer encoded as a VarInt from a memory location. + * + * @tparam Int The type of integer to decode. + * @param ptr The pointer to the memory location to read the integer from. + * @return A pair consisting of the decoded integer and the number of bytes that the encoded integer + * occupied at the memory location. + */ +template +[[nodiscard]] std::pair varint_decode(const std::uint8_t *ptr) { + return varint_decode_general(ptr); +} + +#ifdef KAMINPAR_COMPRESSION_FAST_DECODING +/*! + * Reads a 32-bit integer encoded as a VarInt from a memory location. The decoding is implemented + * as an unrolled loop with intrinsic operations. + * + * @param ptr The pointer to the memory location to read the integer from. + * @return A pair consisting of the decoded integer and the number of bytes that the encoded integer + * occupied at the memory location. + */ +template <> +inline std::pair varint_decode(const std::uint8_t *ptr) { + if ((ptr[0] & 0b10000000) == 0) { + const std::uint32_t result = *ptr & 0b01111111; + return std::make_pair(result, 1); + } + + if ((ptr[1] & 0b10000000) == 0) { + const std::uint32_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F7F); + return std::make_pair(result, 2); + } + + if ((ptr[2] & 0b10000000) == 0) { + const std::uint32_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F7F7F); + return std::make_pair(result, 3); + } + + if ((ptr[3] & 0b10000000) == 0) { + const std::uint32_t result = + _pext_u32(*reinterpret_cast(ptr), 0x7F7F7F7F); + return std::make_pair(result, 4); + } + + const std::uint32_t result = static_cast( + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F) + ); + return std::make_pair(result, 5); +} + +/*! + * Reads a 64-bit integer encoded as a VarInt from a memory location. The decoding is implemented + * as an unrolled loop with intrinsic operations. + * + * @param ptr The pointer to the memory location to read the integer from. + * @return A pair consisting of the decoded integer and the number of bytes that the encoded integer + * occupied at the memory location. 
+ */ +template <> +inline std::pair varint_decode(const std::uint8_t *ptr) { + if ((ptr[0] & 0b10000000) == 0) { + const std::uint64_t result = *ptr & 0b01111111; + return std::make_pair(result, 1); + } + + if ((ptr[1] & 0b10000000) == 0) { + const std::uint64_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F7F); + return std::make_pair(result, 2); + } + + if ((ptr[2] & 0b10000000) == 0) { + const std::uint64_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F7F7F); + return std::make_pair(result, 3); + } + + if ((ptr[3] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u32(*reinterpret_cast(ptr), 0x7F7F7F7F); + return std::make_pair(result, 4); + } + + if ((ptr[4] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F); + return std::make_pair(result, 5); + } + + if ((ptr[5] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F); + return std::make_pair(result, 6); + } + + if ((ptr[6] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F); + return std::make_pair(result, 7); + } + + if ((ptr[7] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F7F); + return std::make_pair(result, 8); + } + + if ((ptr[8] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F7F) | + (static_cast(ptr[8] & 0b01111111) << 56); + return std::make_pair(result, 9); + } + + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F7F) | + (static_cast(ptr[8] & 0b01111111) << 56) | + (static_cast(ptr[9]) << 63); + return std::make_pair(result, 10); +} +#endif + +/*! + * Reads an integer encoded as a signed VarInt from a memory location. The decoding is implemented + * as a loop with non intrinsic operations. + * + * @tparam Int The type of integer to decode. + * @param ptr The pointer to the memory location to read the integer from. + * @return A pair consisting of the decoded integer and the number of bytes that the encoded integer + * occupied at the memory location. + */ +template +[[nodiscard]] std::pair signed_varint_decode_general(const std::uint8_t *ptr) { + const auto [unsigned_value, len] = varint_decode_general>(ptr); + return std::make_pair(zigzag_decode(unsigned_value), len); +} + +/*! + * Reads an integer encoded as a signed VarInt from a memory location. + * + * @tparam Int The type of integer to decode. + * @param ptr The pointer to the memory location to read the integer from. + * @return A pair consisting of the decoded integer and the number of bytes that the encoded integer + * occupied at the memory location. + */ +template +[[nodiscard]] std::pair signed_varint_decode(const std::uint8_t *ptr) { + const auto [unsigned_value, len] = varint_decode>(ptr); + return std::make_pair(zigzag_decode(unsigned_value), len); +} + +/*! + * Reads an integer encoded as a marked VarInt from a memory location. The decoding is implemented + * as a loop with non intrinsic operations. + * + * @tparam Int The type of integer to decode. + * @param ptr The pointer to the memory location to read the integer from. + * @return A tuple consisting of the decoded integer, whether the markes is set and the number of + * bytes that the encoded integer occupied at the memory location. 
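+ *
+ * For example, the single byte 0b01000101 decodes to the value 5 with the marker set: the lowest
+ * six bits carry the value, bit 6 is the marker, and bit 7 is the continuation bit.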
+ */ +template +[[nodiscard]] std::tuple marked_varint_decode(const std::uint8_t *ptr) { + const std::uint8_t first_byte = *ptr; + const bool is_continuation_bit_set = (first_byte & 0b10000000) != 0; + const bool is_marker_set = (first_byte & 0b01000000) != 0; + + Int result = first_byte & 0b00111111; + std::size_t shift = 0; + std::size_t position = 1; + + if (is_continuation_bit_set) { + while (true) { + const std::uint8_t byte = ptr[position++]; + + if ((byte & 0b10000000) == 0) { + result |= static_cast(byte) << (shift + 6); + break; + } else { + result |= static_cast(byte & 0b01111111) << (shift + 6); + } + + shift += 7; + } + } + + return std::make_tuple(result, is_marker_set, position); +} + +#ifdef KAMINPAR_COMPRESSION_FAST_DECODING +/*! + * Reads a 32-bit integer encoded as a marked VarInt from a memory location. The decoding is + * implemented as an unrolled loop with intrinsic operations. + * + * @tparam Int The type of integer to decode. + * @param ptr The pointer to the memory location to read the integer from. + * @return A tuple consisting of the decoded integer, whether the markes is set and the number of + * bytes that the encoded integer occupied at the memory location. + */ +template <> +inline std::tuple +marked_varint_decode(const std::uint8_t *ptr) { + const bool is_marker_set = (*ptr & 0b01000000) != 0; + + if ((ptr[0] & 0b10000000) == 0) { + const std::uint32_t result = *ptr & 0b00111111; + return std::make_tuple(result, is_marker_set, 1); + } + + if ((ptr[1] & 0b10000000) == 0) { + const std::uint32_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F3F); + return std::make_tuple(result, is_marker_set, 2); + } + + if ((ptr[2] & 0b10000000) == 0) { + const std::uint32_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F7F3F); + return std::make_tuple(result, is_marker_set, 3); + } + + if ((ptr[3] & 0b10000000) == 0) { + const std::uint32_t result = + _pext_u32(*reinterpret_cast(ptr), 0x7F7F7F3F); + return std::make_tuple(result, is_marker_set, 4); + } + + const std::uint32_t result = static_cast( + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F3F) + ); + return std::make_tuple(result, is_marker_set, 5); +} + +/*! + * Reads a 64-bit integer encoded as a marked VarInt from a memory location. The decoding is + * implemented as an unrolled loop with intrinsic operations. + * + * @tparam Int The type of integer to decode. + * @param ptr The pointer to the memory location to read the integer from. + * @return A tuple consisting of the decoded integer, whether the markes is set and the number of + * bytes that the encoded integer occupied at the memory location. 
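+ *
+ * Note that the extraction masks end in 0x3F rather than 0x7F because the first byte contributes
+ * only six payload bits; bit 6 is the marker and bit 7 the continuation bit.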
+ */ +template <> +inline std::tuple +marked_varint_decode(const std::uint8_t *ptr) { + const bool is_marker_set = (*ptr & 0b01000000) != 0; + + if ((ptr[0] & 0b10000000) == 0) { + const std::uint64_t result = *ptr & 0b00111111; + return std::make_tuple(result, is_marker_set, 1); + } + + if ((ptr[1] & 0b10000000) == 0) { + const std::uint64_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F3F); + return std::make_tuple(result, is_marker_set, 2); + } + + if ((ptr[2] & 0b10000000) == 0) { + const std::uint64_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F7F3F); + return std::make_tuple(result, is_marker_set, 3); + } + + if ((ptr[3] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u32(*reinterpret_cast(ptr), 0x7F7F7F3F); + return std::make_tuple(result, is_marker_set, 4); + } + + if ((ptr[4] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F3F); + return std::make_tuple(result, is_marker_set, 5); + } + + if ((ptr[5] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F3F); + return std::make_tuple(result, is_marker_set, 6); + } + + if ((ptr[6] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F3F); + return std::make_tuple(result, is_marker_set, 7); + } + + if ((ptr[7] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F3F); + return std::make_tuple(result, is_marker_set, 8); + } + + if ((ptr[8] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F3F) | + (static_cast(ptr[8] & 0b01111111) << 55); + return std::make_tuple(result, is_marker_set, 9); + } + + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F3F) | + (static_cast(ptr[8] & 0b01111111) << 55) | + (static_cast(ptr[9]) << 62); + return std::make_tuple(result, is_marker_set, 10); +} +#endif + +} // namespace kaminpar diff --git a/kaminpar-common/varint_run_length_codec.h b/kaminpar-common/varint_run_length_codec.h new file mode 100644 index 00000000..6120bfb8 --- /dev/null +++ b/kaminpar-common/varint_run_length_codec.h @@ -0,0 +1,380 @@ +/******************************************************************************* + * Encoding and decoding methods for run-length VarInts. + * + * @file: varint_run_length_codec.h + * @author: Daniel Salwasser + * @date: 29.12.2023 + ******************************************************************************/ +#pragma once + +#include +#include +#include +#include + +namespace kaminpar { + +/*! + * An encoder for writing run-length VarInts. + * + * @tparam Int The type of integer to encode. + */ +template class VarIntRunLengthEncoder { + static_assert(sizeof(Int) == 4 || sizeof(Int) == 8); + +public: + static constexpr std::size_t kBufferSize = (sizeof(Int) == 4) ? 64 : 32; + + /*! + * Constructs a new VarIntRunLengthEncoder. + * + * @param ptr The pointer to the memory location where the encoded integers are written. + */ + VarIntRunLengthEncoder(std::uint8_t *ptr) : _ptr(ptr) {} + + /*! + * Encodes an integer. + * + * @param i The integer to encode. + * @return The number of bytes that the integer requires to be stored in encoded format. It + * includes the control byte if it is the first integer of a block. 
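+ *
+ * For example (32-bit case), a run of three values that each fit into two bytes is later written
+ * as the control byte 0b00001001 (run length 3, value width 2 bytes) followed by six little-endian
+ * data bytes.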
+ */ + std::size_t add(Int i) { + std::uint8_t size = needed_bytes(i); + + if (_buffer.empty()) { + _buffered_size = size++; + } else if (_buffer.size() == kBufferSize || _buffered_size != size) { + flush(); + _buffered_size = size++; + } + + _buffer.push_back(i); + return size; + } + + /*! + * Writes the remaining integers added to the encoder which do not form a complete block to + * memory. + */ + void flush() { + if (_buffer.empty()) { + return; + } + + const std::uint8_t *begin = _ptr; + if constexpr (sizeof(Int) == 4) { + const std::uint8_t header = (static_cast(_buffer.size() - 1) << 2) | + ((_buffered_size - 1) & 0b00000011); + *_ptr++ = header; + } else if constexpr (sizeof(Int) == 8) { + const std::uint8_t header = (static_cast(_buffer.size() - 1) << 3) | + ((_buffered_size - 1) & 0b00000111); + *_ptr++ = header; + } + + for (Int value : _buffer) { + for (std::uint8_t i = 0; i < _buffered_size; ++i) { + *_ptr++ = static_cast(value); + value >>= 8; + } + } + + _buffer.clear(); + } + +private: + std::uint8_t *_ptr; + + std::uint8_t _buffered_size; + std::vector _buffer; + + std::uint8_t needed_bytes(Int i) const { + std::size_t len = 1; + + while (i > 0b11111111) { + i >>= 8; + len++; + } + + return len; + } +}; + +/*! + * A decoder for reading run-length VarInts. + * + * @tparam Int The type of integer to decode. + */ +template class VarIntRunLengthDecoder { + static_assert(sizeof(Int) == 4 || sizeof(Int) == 8); + +public: + /*! + * Constructs a new VarIntRunLengthDecoder. + * + * @param ptr The pointer to the memory location where the encoded integers are stored. + */ + VarIntRunLengthDecoder(const std::uint8_t *ptr) : _ptr(ptr) {} + + /*! + * Decodes the encoded integers. + * + * @param max_decoded The amount of integers to decode. + * @param l The function to be called with the decoded integers, i.e. the function has one + * parameter of type Int. 
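+ *
+ * Usage sketch (illustrative; the surrounding names are made up):
+ * \code
+ * VarIntRunLengthDecoder<std::uint32_t> decoder(compressed_data);
+ * decoder.decode(num_values, [&](const std::uint32_t value) { sum += value; });
+ * \endcode
+ * If the lambda returns a bool instead of void, returning true stops the decoding early.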
+ */ + template void decode(const std::size_t max_decoded, Lambda &&l) { + constexpr bool non_stoppable = std::is_void>::value; + + std::size_t decoded = 0; + while (decoded < max_decoded) { + const std::uint8_t run_header = *_ptr++; + + if constexpr (sizeof(Int) == 4) { + std::uint8_t run_length = (run_header >> 2) + 1; + const std::uint8_t run_size = (run_header & 0b00000011) + 1; + + decoded += run_length; + if (decoded > max_decoded) { + run_length -= decoded - max_decoded; + } + + if constexpr (non_stoppable) { + decode32(run_length, run_size, std::forward(l)); + } else { + const bool stop = decode32(run_length, run_size, std::forward(l)); + if (stop) { + return; + } + } + } else if constexpr (sizeof(Int) == 8) { + std::uint8_t run_length = (run_header >> 3) + 1; + const std::uint8_t run_size = (run_header & 0b00000111) + 1; + + decoded += run_length; + if (decoded > max_decoded) { + run_length -= decoded - max_decoded; + } + + if constexpr (non_stoppable) { + decode64(run_length, run_size, std::forward(l)); + } else { + const bool stop = decode64(run_length, run_size, std::forward(l)); + if (stop) { + return; + } + } + } + } + } + +private: + const std::uint8_t *_ptr; + + template + bool decode32(const std::uint8_t run_length, const std::uint8_t run_size, Lambda &&l) { + constexpr bool non_stoppable = std::is_void>::value; + + switch (run_size) { + case 1: + for (std::uint8_t i = 0; i < run_length; ++i) { + std::uint32_t value = static_cast(*_ptr); + _ptr += 1; + + if constexpr (non_stoppable) { + l(value); + } else { + const bool stop = l(value); + if (stop) { + return true; + } + } + } + break; + case 2: + for (std::uint8_t i = 0; i < run_length; ++i) { + std::uint32_t value = *((std::uint16_t *)_ptr); + _ptr += 2; + + if constexpr (non_stoppable) { + l(value); + } else { + const bool stop = l(value); + if (stop) { + return true; + } + } + } + break; + case 3: + for (std::uint8_t i = 0; i < run_length; ++i) { + std::uint32_t value = *((std::uint32_t *)_ptr) & 0xFFFFFF; + _ptr += 3; + + if constexpr (non_stoppable) { + l(value); + } else { + const bool stop = l(value); + if (stop) { + return true; + } + } + } + break; + case 4: + for (std::uint8_t i = 0; i < run_length; ++i) { + std::uint32_t value = *((std::uint32_t *)_ptr); + _ptr += 4; + + if constexpr (non_stoppable) { + l(value); + } else { + const bool stop = l(value); + if (stop) { + return true; + } + } + } + break; + default: + throw std::runtime_error("unexpected run size"); + } + + return false; + } + + template + bool decode64(const std::uint8_t run_length, const std::uint8_t run_size, Lambda &&l) { + constexpr bool non_stoppable = std::is_void>::value; + + switch (run_size) { + case 1: + for (std::uint8_t i = 0; i < run_length; ++i) { + std::uint64_t value = static_cast(*_ptr); + _ptr += 1; + + if constexpr (non_stoppable) { + l(value); + } else { + const bool stop = l(value); + if (stop) { + return true; + } + } + } + break; + case 2: + for (std::uint8_t i = 0; i < run_length; ++i) { + std::uint64_t value = *((std::uint16_t *)_ptr); + _ptr += 2; + + if constexpr (non_stoppable) { + l(value); + } else { + const bool stop = l(value); + if (stop) { + return true; + } + } + } + break; + case 3: + for (std::uint8_t i = 0; i < run_length; ++i) { + std::uint64_t value = *((std::uint32_t *)_ptr) & 0xFFFFFF; + _ptr += 3; + + if constexpr (non_stoppable) { + l(value); + } else { + const bool stop = l(value); + if (stop) { + return true; + } + } + } + break; + case 4: + for (std::uint8_t i = 0; i < run_length; ++i) { + 
std::uint64_t value = *((std::uint32_t *)_ptr); + _ptr += 4; + + if constexpr (non_stoppable) { + l(value); + } else { + const bool stop = l(value); + if (stop) { + return true; + } + } + } + break; + case 5: + for (std::uint8_t i = 0; i < run_length; ++i) { + std::uint64_t value = *((std::uint64_t *)_ptr) & 0xFFFFFFFFFF; + _ptr += 5; + + if constexpr (non_stoppable) { + l(value); + } else { + const bool stop = l(value); + if (stop) { + return true; + } + } + } + break; + case 6: + for (std::uint8_t i = 0; i < run_length; ++i) { + std::uint64_t value = *((std::uint64_t *)_ptr) & 0xFFFFFFFFFFFF; + _ptr += 6; + + if constexpr (non_stoppable) { + l(value); + } else { + const bool stop = l(value); + if (stop) { + return true; + } + } + } + break; + case 7: + for (std::uint8_t i = 0; i < run_length; ++i) { + std::uint64_t value = *((std::uint64_t *)_ptr) & 0xFFFFFFFFFFFFFF; + _ptr += 7; + + if constexpr (non_stoppable) { + l(value); + } else { + const bool stop = l(value); + if (stop) { + return true; + } + } + } + break; + case 8: + for (std::uint8_t i = 0; i < run_length; ++i) { + std::uint64_t value = *((std::uint64_t *)_ptr); + _ptr += 8; + + if constexpr (non_stoppable) { + l(value); + } else { + const bool stop = l(value); + if (stop) { + return true; + } + } + } + break; + default: + throw std::runtime_error("unexpected run size"); + } + + return false; + } +}; + +}; // namespace kaminpar diff --git a/kaminpar-common/varint_stream_codec.h b/kaminpar-common/varint_stream_codec.h new file mode 100644 index 00000000..f6db0742 --- /dev/null +++ b/kaminpar-common/varint_stream_codec.h @@ -0,0 +1,307 @@ +/******************************************************************************* + * Encoding and decoding methods for the StreamVByte codec. + * + * @file: varint_stream_codec.h + * @author: Daniel Salwasser + * @date: 29.12.2023 + ******************************************************************************/ +#pragma once + +#include +#include + +#include + +#include "kaminpar-common/constexpr_utils.h" +#include "kaminpar-common/varint_codec.h" + +namespace kaminpar { + +/*! + * An encoder for writing variable length integers with the StreamVByte codec. + * + * @tparam Int The type of integer to encode. + */ +template class VarIntStreamEncoder { + static_assert(sizeof(Int) == 4); + +public: + /*! + * Constructs a new VarIntStreamEncoder. + * + * @param ptr The pointer to the memory location where the encoded integers are written. + * @param count The amount of integers to encode. + */ + VarIntStreamEncoder(std::uint8_t *ptr, std::size_t count) + : _control_bytes_ptr(ptr), + _data_ptr(ptr + count / 4 + ((count % 4) != 0)), + _count(count), + _buffered(0) {} + + /*! + * Encodes an integer. + * + * @param i The integer to encode. + * @return The number of bytes that the integer requires to be stored in encoded format. It + * includes the control byte if it is the last integer of a block. + */ + std::size_t add(Int i) { + if (_buffered == 3) { + _buffer[3] = i; + write_stream(); + + _buffered = 0; + return needed_bytes(i); + } + + _buffer[_buffered] = i; + return needed_bytes(i) + (_buffered++ == 0); + } + + /*! + * Writes the remaining integers added to the encoder which do not form a complete block to + * memory. 
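+ *
+ * For reference, a complete block of the four values 1, 300, 70000, 5 is stored as the control
+ * byte 0b00100100 (two bits per value length, the last value in the highest bits) followed by
+ * 1 + 2 + 3 + 1 = 7 little-endian data bytes; all control bytes are placed in a prefix block that
+ * precedes the data bytes.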
+ */ + void flush() { + if (_buffered == 0) { + return; + } + + const std::uint8_t control_byte = + ((needed_bytes(_buffer[3]) - 1) << 6) | (((needed_bytes(_buffer[2]) - 1) & 0b11) << 4) | + (((needed_bytes(_buffer[1]) - 1) & 0b11) << 2) | ((needed_bytes(_buffer[0]) - 1) & 0b11); + *_control_bytes_ptr++ = control_byte; + + for (std::size_t i = 0; i < _buffered; ++i) { + Int value = _buffer[i]; + do { + *_data_ptr++ = static_cast(value); + value >>= 8; + } while (value > 0); + } + } + +private: + std::uint8_t *_control_bytes_ptr; + std::uint8_t *_data_ptr; + const std::size_t _count; + + std::size_t _buffered; + std::array _buffer; + + void write_stream() { + const std::uint8_t control_byte = + ((needed_bytes(_buffer[3]) - 1) << 6) | (((needed_bytes(_buffer[2]) - 1) & 0b11) << 4) | + (((needed_bytes(_buffer[1]) - 1) & 0b11) << 2) | ((needed_bytes(_buffer[0]) - 1) & 0b11); + *_control_bytes_ptr++ = control_byte; + + for (Int value : _buffer) { + do { + *_data_ptr++ = static_cast(value); + value >>= 8; + } while (value > 0); + } + } + + std::uint8_t needed_bytes(Int i) const { + std::size_t len = 1; + + while (i > 0b11111111) { + i >>= 8; + len++; + } + + return len; + } +}; + +/*! + * A decoder for reading variable length integers stored with the StreamVByte codec. + * + * @tparam Int The type of integer to decode. + */ +template class VarIntStreamDecoder { + static_assert(sizeof(Int) == 4); + + static constexpr std::array create_length_table() { + std::array length_table{}; + + constexpr_for<256>([&](const std::uint8_t control_byte) { + length_table[control_byte] = 0; + + constexpr_for<4>([&](const std::uint8_t i) { + const std::uint8_t length = ((control_byte >> (2 * i)) & 0b11) + 1; + length_table[control_byte] += length; + }); + }); + + return length_table; + } + + static constexpr std::array, 256> create_shuffle_table() { + std::array, 256> shuffle_table{}; + + constexpr_for<256>([&](const std::uint8_t control_byte) { + std::uint8_t byte = 0; + std::uint8_t pos = 0; + + constexpr_for<4>([&](const std::uint8_t i) { + std::uint8_t c = (control_byte >> (2 * i)) & 0b11; + + std::uint8_t j = 0; + while (j <= c) { + shuffle_table[control_byte][pos++] = byte++; + j += 1; + } + + while (j < 4) { + shuffle_table[control_byte][pos++] = 0b11111111; + j += 1; + } + }); + }); + + return shuffle_table; + } + + static const constexpr std::array kLengthTable = create_length_table(); + + static const constexpr std::array, 256> kShuffleTable = + create_shuffle_table(); + +public: + /*! + * Constructs a new VarIntStreamDecoder. + * + * @param ptr The pointer to the memory location where the encoded integers are stored. + * @param count The amount of integers that are stored at the memory location. + */ + VarIntStreamDecoder(const std::uint8_t *ptr, const std::size_t count) + : _control_bytes_ptr(ptr), + _control_bytes(count / 4), + _data_ptr(ptr + _control_bytes + ((count % 4) != 0)), + _count(count) {} + + /*! + * Decodes the encoded integers. + * + * @param max_count The amount of integers to decode, it has to be less then the amount of + * integers stored that are stored. + * @param l The function to be called with the decoded integers, i.e. the function has one + * parameter of type Int. 
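+ *
+ * Usage sketch (illustrative; the surrounding names are made up):
+ * \code
+ * VarIntStreamDecoder<std::uint32_t> decoder(compressed_data, num_values);
+ * decoder.decode(num_values, [&](const std::uint32_t value) { degrees.push_back(value); });
+ * \endcode
+ * As with the other decoders, a lambda returning bool can stop the decoding early by returning
+ * true.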
+ */ + template void decode(const std::size_t max_count, Lambda &&l) { + constexpr bool non_stoppable = std::is_void>::value; + + // max_count = std::min(max_count, _count); + + const std::size_t control_bytes = max_count / 4; + for (std::size_t i = 0; i < control_bytes; ++i) { + const std::uint8_t control_byte = _control_bytes_ptr[i]; + const std::uint8_t length = kLengthTable[control_byte]; + + __m128i data = _mm_loadu_si128((const __m128i *)_data_ptr); + _data_ptr += length; + + const std::uint8_t *shuffle_mask = kShuffleTable[control_byte].data(); + data = _mm_shuffle_epi8(data, *(const __m128i *)shuffle_mask); + + if constexpr (non_stoppable) { + l(_mm_extract_epi32(data, 0)); + l(_mm_extract_epi32(data, 1)); + l(_mm_extract_epi32(data, 2)); + l(_mm_extract_epi32(data, 3)); + } else { + if (l(_mm_extract_epi32(data, 0))) { + return; + } + + if (l(_mm_extract_epi32(data, 1))) { + return; + } + + if (l(_mm_extract_epi32(data, 2))) { + return; + } + + if (l(_mm_extract_epi32(data, 3))) { + return; + } + } + } + + switch (max_count % 4) { + case 1: { + const std::uint8_t control_byte = _control_bytes_ptr[control_bytes]; + const std::uint8_t *shuffle_mask = kShuffleTable[control_byte].data(); + + __m128i data = _mm_loadu_si128((const __m128i *)_data_ptr); + data = _mm_shuffle_epi8(data, *(const __m128i *)shuffle_mask); + + if constexpr (non_stoppable) { + l(_mm_extract_epi32(data, 0)); + } else { + if (l(_mm_extract_epi32(data, 0))) { + return; + } + } + break; + } + case 2: { + const std::uint8_t control_byte = _control_bytes_ptr[control_bytes]; + const std::uint8_t *shuffle_mask = kShuffleTable[control_byte].data(); + + __m128i data = _mm_loadu_si128((const __m128i *)_data_ptr); + data = _mm_shuffle_epi8(data, *(const __m128i *)shuffle_mask); + + if constexpr (non_stoppable) { + l(_mm_extract_epi32(data, 0)); + l(_mm_extract_epi32(data, 1)); + } else { + if (l(_mm_extract_epi32(data, 0))) { + return; + } + + if (l(_mm_extract_epi32(data, 1))) { + return; + } + } + break; + } + case 3: { + const std::uint8_t control_byte = _control_bytes_ptr[control_bytes]; + const std::uint8_t *shuffle_mask = kShuffleTable[control_byte].data(); + + __m128i data = _mm_loadu_si128((const __m128i *)_data_ptr); + data = _mm_shuffle_epi8(data, *(const __m128i *)shuffle_mask); + + if constexpr (non_stoppable) { + l(_mm_extract_epi32(data, 0)); + l(_mm_extract_epi32(data, 1)); + l(_mm_extract_epi32(data, 2)); + } else { + if (l(_mm_extract_epi32(data, 0))) { + return; + } + + if (l(_mm_extract_epi32(data, 1))) { + return; + } + + if (l(_mm_extract_epi32(data, 2))) { + return; + } + } + break; + } + } + } + +private: + const std::uint8_t *_control_bytes_ptr; + const std::size_t _control_bytes; + const std::uint8_t *_data_ptr; + const std::size_t _count; +}; + +} // namespace kaminpar diff --git a/kaminpar-dist/coarsening/clustering/lp/global_lp_clusterer.cc b/kaminpar-dist/coarsening/clustering/lp/global_lp_clusterer.cc index 6d16afae..1cf875bf 100644 --- a/kaminpar-dist/coarsening/clustering/lp/global_lp_clusterer.cc +++ b/kaminpar-dist/coarsening/clustering/lp/global_lp_clusterer.cc @@ -9,14 +9,13 @@ #include +#include "kaminpar-mpi/sparse_alltoall.h" + #include "kaminpar-dist/datastructures/distributed_graph.h" #include "kaminpar-dist/datastructures/growt.h" +#include "kaminpar-dist/distributed_label_propagation.h" #include "kaminpar-dist/graphutils/communication.h" - -#include "kaminpar-shm/label_propagation.h" - -#include "kaminpar-common/datastructures/fast_reset_array.h" -#include "kaminpar-common/math.h" 
+#include "kaminpar-dist/timer.h" namespace kaminpar::dist { namespace { @@ -49,8 +48,8 @@ struct UnorderedRatingMap { }; struct GlobalLPClusteringConfig : public LabelPropagationConfig { - using Graph = DistributedGraph; using RatingMap = ::kaminpar::RatingMap; + using ClusterID = GlobalNodeID; using ClusterWeight = GlobalNodeWeight; @@ -70,8 +69,6 @@ class GlobalLPClusteringImpl final using ClusterBase = NonatomicOwnedClusterVector; using WeightDeltaMap = growt::GlobalNodeIDMap; - struct Statistics {}; - public: explicit GlobalLPClusteringImpl(const Context &ctx) : ClusterBase{ctx.partition.graph->total_n}, @@ -547,9 +544,7 @@ class GlobalLPClusteringImpl final from, to, [&](const NodeID lnode) { return _changed_label[lnode] != kInvalidGlobalNodeID; }, - [&](const NodeID lnode) -> ChangedLabelMessage { - return {lnode, cluster(lnode)}; - }, + [&](const NodeID lnode) -> ChangedLabelMessage { return {lnode, cluster(lnode)}; }, [&](const auto &buffer, const PEID owner) { tbb::parallel_for(tbb::blocked_range(0, buffer.size()), [&](const auto &r) { auto &weight_delta_handle = _weight_delta_handles_ets.local(); diff --git a/kaminpar-dist/coarsening/clustering/lp/local_lp_clusterer.cc b/kaminpar-dist/coarsening/clustering/lp/local_lp_clusterer.cc index 87a75cbb..3a4e279a 100644 --- a/kaminpar-dist/coarsening/clustering/lp/local_lp_clusterer.cc +++ b/kaminpar-dist/coarsening/clustering/lp/local_lp_clusterer.cc @@ -8,13 +8,13 @@ ******************************************************************************/ #include "kaminpar-dist/coarsening/clustering/lp/local_lp_clusterer.h" -#include "kaminpar-shm/label_propagation.h" +#include "kaminpar-dist/distributed_label_propagation.h" namespace kaminpar::dist { struct LocalLPClusteringConfig : public LabelPropagationConfig { - using Graph = DistributedGraph; using ClusterID = NodeID; using ClusterWeight = NodeWeight; + static constexpr bool kTrackClusterCount = false; static constexpr bool kUseTwoHopClustering = true; }; diff --git a/kaminpar-dist/coarsening/coarsener.cc b/kaminpar-dist/coarsening/coarsener.cc index 5c14b27e..1c6ca95f 100644 --- a/kaminpar-dist/coarsening/coarsener.cc +++ b/kaminpar-dist/coarsening/coarsener.cc @@ -9,13 +9,11 @@ #include "kaminpar-dist/coarsening/contraction/cluster_contraction.h" #include "kaminpar-dist/coarsening/contraction/local_cluster_contraction.h" -#include "kaminpar-dist/context.h" #include "kaminpar-dist/datastructures/distributed_graph.h" #include "kaminpar-dist/datastructures/distributed_partitioned_graph.h" #include "kaminpar-dist/factories.h" -#include "kaminpar-shm/context.h" -#include "kaminpar-shm/partition_utils.h" +#include "kaminpar-shm/coarsening/max_cluster_weights.h" namespace kaminpar::dist { SET_DEBUG(false); @@ -194,11 +192,11 @@ const DistributedGraph *Coarsener::nth_coarsest(const std::size_t n) const { GlobalNodeWeight Coarsener::max_cluster_weight() const { const auto *graph = coarsest(); - return shm::compute_max_cluster_weight( + return shm::compute_max_cluster_weight( _input_ctx.coarsening, + _input_ctx.partition, graph->global_n(), - graph->global_total_node_weight(), - _input_ctx.partition + graph->global_total_node_weight() ); } } // namespace kaminpar::dist diff --git a/kaminpar-dist/coarsening/contraction/cluster_contraction.cc b/kaminpar-dist/coarsening/contraction/cluster_contraction.cc index 79894cc1..f11ac8f4 100644 --- a/kaminpar-dist/coarsening/contraction/cluster_contraction.cc +++ b/kaminpar-dist/coarsening/contraction/cluster_contraction.cc @@ -104,7 +104,8 @@ 
find_nonlocal_nodes(const DistributedGraph &graph, const GlobalClustering &lnode const GlobalNodeID gcluster = lnode_to_gcluster[lnode]; if (!graph.is_owned_global_node(gcluster)) { nonlocal_nodes[node_position_buffer[lnode]] = { - .u = gcluster, .weight = graph.node_weight(lnode)}; + .u = gcluster, .weight = graph.node_weight(lnode) + }; } }); @@ -224,9 +225,7 @@ void update_ghost_node_weights(DistributedGraph &graph) { mpi::graph::sparse_alltoall_interface_to_pe( graph, - [&](const NodeID u) -> Message { - return {u, graph.node_weight(u)}; - }, + [&](const NodeID u) -> Message { return {u, graph.node_weight(u)}; }, [&](const auto buffer, const PEID pe) { tbb::parallel_for(0, buffer.size(), [&](const std::size_t i) { const auto &[local_node_on_other_pe, weight] = buffer[i]; @@ -424,7 +423,8 @@ MigrationResult migrate_elements( .sendcounts = std::move(sendcounts), .sdispls = std::move(sdispls), .recvcounts = std::move(recvcounts), - .rdispls = std::move(rdispls)}; + .rdispls = std::move(rdispls) + }; } MigrationResult @@ -816,9 +816,7 @@ void rebalance_cluster_placement( }; mpi::graph::sparse_alltoall_interface_to_pe( graph, - [&](const NodeID lnode) -> Message { - return {lnode, lnode_to_gcluster[lnode]}; - }, + [&](const NodeID lnode) -> Message { return {lnode, lnode_to_gcluster[lnode]}; }, [&](const auto buffer, const PEID pe) { tbb::parallel_for(0, buffer.size(), [&](const std::size_t i) { const auto &[their_lnode, new_gcluster] = buffer[i]; @@ -862,8 +860,8 @@ bool validate_clustering(const DistributedGraph &graph, const GlobalClustering & const NodeID lnode = graph.global_to_local_node(gnode); if (lnode_to_gcluster[lnode] != gcluster) { LOG_WARNING << "Inconsistent cluster for local node " << lnode - << " (ghost node, global node ID " << gnode << "): " - << "the node is owned by PE " << pe + << " (ghost node, global node ID " << gnode + << "): " << "the node is owned by PE " << pe << ", which assigned the node to cluster " << gcluster << ", but our ghost node is assigned to cluster " << lnode_to_gcluster[lnode] << "; aborting"; @@ -1322,7 +1320,9 @@ ContractionResult contract_clustering( // Finally, build coarse graph START_TIMER("Construct coarse graph"); auto all_buffered_nodes = - ts_navigable_list::combine(edge_buffer_ets); + ts_navigable_list::combine( + edge_buffer_ets + ); tbb::parallel_for(0, c_n, [&](const NodeID i) { const auto &marker = all_buffered_nodes[i]; @@ -1456,9 +1456,7 @@ DistributedPartitionedGraph project_partition( mpi::graph::sparse_alltoall_interface_to_pe( graph, - [&](const NodeID lnode) -> GhostNodeLabel { - return {lnode, partition[lnode]}; - }, + [&](const NodeID lnode) -> GhostNodeLabel { return {lnode, partition[lnode]}; }, [&](const auto buffer, const PEID pe) { tbb::parallel_for(0, buffer.size(), [&](const std::size_t i) { const auto &[sender_lnode, block] = buffer[i]; diff --git a/kaminpar-dist/coarsening/contraction/local_cluster_contraction.cc b/kaminpar-dist/coarsening/contraction/local_cluster_contraction.cc index 3f21e6ae..12266c35 100644 --- a/kaminpar-dist/coarsening/contraction/local_cluster_contraction.cc +++ b/kaminpar-dist/coarsening/contraction/local_cluster_contraction.cc @@ -272,7 +272,8 @@ Result contract_local_clustering( std::move(c_ghost_to_global), std::move(c_global_to_ghost), false, - graph.communicator()}; + graph.communicator() + }; return {std::move(c_graph), std::move(mapping), std::move(m_ctx)}; } diff --git a/kaminpar-dist/coarsening/contraction/local_cluster_contraction.h 
b/kaminpar-dist/coarsening/contraction/local_cluster_contraction.h index fbe98301..8925c519 100644 --- a/kaminpar-dist/coarsening/contraction/local_cluster_contraction.h +++ b/kaminpar-dist/coarsening/contraction/local_cluster_contraction.h @@ -24,7 +24,7 @@ struct MemoryContext { scalable_vector buckets; scalable_vector> buckets_index; scalable_vector> leader_mapping; - scalable_vector> all_buffered_nodes; + StaticArray> all_buffered_nodes; }; struct Result { diff --git a/kaminpar-dist/datastructures/distributed_graph.cc b/kaminpar-dist/datastructures/distributed_graph.cc index c0dc1e15..1a7d07fd 100644 --- a/kaminpar-dist/datastructures/distributed_graph.cc +++ b/kaminpar-dist/datastructures/distributed_graph.cc @@ -230,7 +230,7 @@ void print_local_graph_stats(const DistributedGraph &graph) { std::fill(buckets.begin(), buckets.end(), 0); EdgeID local_m = 0, nonlocal_m = 0; - EdgeID min_deg = std::numeric_limits::max(), max_deg = 0; + NodeID min_deg = std::numeric_limits::max(), max_deg = 0; for (NodeID u = 0; u < graph.n(); ++u) { for (const auto [e, v] : graph.neighbors(u)) { if (graph.is_owned_node(v)) { diff --git a/kaminpar-dist/datastructures/ghost_node_mapper.h b/kaminpar-dist/datastructures/ghost_node_mapper.h index 09d5d4f2..db259a69 100644 --- a/kaminpar-dist/datastructures/ghost_node_mapper.h +++ b/kaminpar-dist/datastructures/ghost_node_mapper.h @@ -9,14 +9,13 @@ #include -#include "kaminpar-mpi/wrapper.h" - #include "kaminpar-dist/datastructures/growt.h" #include "kaminpar-dist/dkaminpar.h" -#include "kaminpar-dist/logger.h" #include "kaminpar-common/assert.h" #include "kaminpar-common/datastructures/static_array.h" +#include "kaminpar-common/logger.h" +#include "kaminpar-common/parallel/atomic.h" namespace kaminpar::dist::graph { class GhostNodeMapper { @@ -87,7 +86,8 @@ class GhostNodeMapper { return { .global_to_ghost = std::move(global_to_ghost), .ghost_to_global = std::move(ghost_to_global), - .ghost_owner = std::move(ghost_owner)}; + .ghost_owner = std::move(ghost_owner) + }; } private: diff --git a/kaminpar-dist/distributed_label_propagation.h b/kaminpar-dist/distributed_label_propagation.h new file mode 100644 index 00000000..9e181ced --- /dev/null +++ b/kaminpar-dist/distributed_label_propagation.h @@ -0,0 +1,1310 @@ +/******************************************************************************* + * Generic implementation of parallel label propagation. + * + * @file: parallel_label_propagation.h + * @author: Daniel Seemaier + * @date: 21.09.2021 + ******************************************************************************/ +#pragma once + +#include +#include + +#include +#include +#include +#include + +#include "kaminpar-dist/datastructures/distributed_graph.h" + +#include "kaminpar-common/assert.h" +#include "kaminpar-common/datastructures/dynamic_map.h" +#include "kaminpar-common/datastructures/rating_map.h" +#include "kaminpar-common/datastructures/scalable_vector.h" +#include "kaminpar-common/logger.h" +#include "kaminpar-common/parallel/atomic.h" +#include "kaminpar-common/random.h" +#include "kaminpar-common/tags.h" + +namespace kaminpar::dist { +struct LabelPropagationConfig { + using Graph = DistributedGraph; + + // Data structure used to accumulate edge weights for gain value calculation + using RatingMap = ::kaminpar::RatingMap>; + + // Data type for cluster IDs and weights + using ClusterID = tag::Mandatory; + using ClusterWeight = tag::Mandatory; + + // Approx. 
number of edges per work unit + static constexpr shm::NodeID kMinChunkSize = 1024; + + // Nodes per permutation unit: when iterating over nodes in a chunk, we divide + // them into permutation units, iterate over permutation orders in random + // order, and iterate over nodes inside a permutation unit in random order. + static constexpr shm::NodeID kPermutationSize = 64; + + // When randomizing the node order inside a permutation unit, we pick a random + // permutation from a pool of permutations. This constant determines the pool + // size. + static constexpr std::size_t kNumberOfNodePermutations = 64; + + // If true, we count the number of empty clusters + static constexpr bool kTrackClusterCount = false; + + // If true, match singleton clusters in 2-hop distance + static constexpr bool kUseTwoHopClustering = false; + + static constexpr bool kUseActualGain = false; + + static constexpr bool kUseActiveSetStrategy = true; + static constexpr bool kUseLocalActiveSetStrategy = false; +}; + +/*! + * Generic implementation of parallel label propagation. To use, inherit from + * this class and implement all mandatory template functions. + * + * @tparam Derived Derived class for static polymorphism. + * @tparam Config Algorithmic configuration and data types. + */ +template class LabelPropagation { + static_assert(std::is_base_of_v); + + SET_DEBUG(false); + SET_STATISTICS_FROM_GLOBAL(); + +protected: + using RatingMap = typename Config::RatingMap; + using Graph = typename Config::Graph; + using NodeID = typename Graph::NodeID; + using NodeWeight = typename Graph::NodeWeight; + using EdgeID = typename Graph::EdgeID; + using EdgeWeight = typename Graph::EdgeWeight; + using ClusterID = typename Config::ClusterID; + using ClusterWeight = typename Config::ClusterWeight; + +public: + void set_max_degree(const NodeID max_degree) { + _max_degree = max_degree; + } + [[nodiscard]] NodeID max_degree() const { + return _max_degree; + } + + void set_max_num_neighbors(const ClusterID max_num_neighbors) { + _max_num_neighbors = max_num_neighbors; + } + [[nodiscard]] ClusterID max_num_neighbors() const { + return _max_num_neighbors; + } + + void set_desired_num_clusters(const ClusterID desired_num_clusters) { + _desired_num_clusters = desired_num_clusters; + } + [[nodiscard]] ClusterID desired_num_clusters() const { + return _desired_num_clusters; + } + + [[nodiscard]] EdgeWeight expected_total_gain() const { + return _expected_total_gain; + } + +protected: + /*! + * (Re)allocates memory to run label propagation on a graph with \c num_nodes + * nodes. + * @param num_nodes Number of nodes in the graph. + */ + void allocate(const NodeID num_nodes, const ClusterID num_clusters) { + allocate(num_nodes, num_nodes, num_clusters); + } + + /*! + * (Re)allocates memory to run label propagation on a graph with \c num_nodes + * nodes in total, but a clustering is only computed for the first \c + * num_active_nodes nodes. + * + * This is mostly useful for distributed graphs where ghost nodes are always + * inactive. + * + * @param num_nodes Total number of nodes in the graph, i.e., neighbors of + * active nodes have an ID less than this. + * @param num_active_nodes Number of nodes for which a cluster label is + * computed. 
+ */ + void allocate(const NodeID num_nodes, const NodeID num_active_nodes, const NodeID num_clusters) { + if (_num_nodes < num_nodes) { + if constexpr (Config::kUseLocalActiveSetStrategy) { + _active.resize(num_nodes); + } + _num_nodes = num_nodes; + } + + if (_num_active_nodes < num_active_nodes) { + if constexpr (Config::kUseActiveSetStrategy) { + _active.resize(num_active_nodes); + } + if constexpr (Config::kUseTwoHopClustering) { + _favored_clusters.resize(num_active_nodes); + } + _num_active_nodes = num_active_nodes; + } + if (_num_clusters < num_clusters) { + for (auto &rating_map : _rating_map_ets) { + rating_map.change_max_size(num_clusters); + } + _num_clusters = num_clusters; + } + } + + /*! + * Initialize label propagation. Must be called after \c allocate(). + * @param graph Graph for label propagation. + * @param num_clusters Number of different clusters the nodes are placed in + * initially. When using label propagation as refinement graphutils, this is + * usually the number of blocks. When using as for clustering, it is usually + * the number of nodes. + */ + void initialize(const Graph *graph, const ClusterID num_clusters) { + KASSERT( + graph->n() == 0 || (_num_nodes > 0u && _num_active_nodes > 0u), + "you must call allocate() before initialize()" + ); + + _graph = graph; + _initial_num_clusters = num_clusters; + _current_num_clusters = num_clusters; + reset_state(); + } + + /*! + * Determines whether we should stop label propagation because the number of + * non-empty clusters has been reduced sufficiently. + * @return Whether label propagation should be stopped now. + */ + bool should_stop() { + if (Config::kTrackClusterCount) { + return _current_num_clusters <= _desired_num_clusters; + } + return false; + } + + /*! + * Move a single node to a new cluster. + * + * @param u The node that is moved. + * @param local_rand Thread-local \c Random object. + * @param local_rating_map Thread-local rating map for gain computation. + * @return Pair with: whether the node was moved to another cluster, whether + * the previous cluster is now empty. + */ + template + std::pair + handle_node(const NodeID u, Random &local_rand, LocalRatingMap &local_rating_map) { + if (derived_skip_node(u)) { + return {false, false}; + } + + const NodeWeight u_weight = _graph->node_weight(u); + const ClusterID u_cluster = derived_cluster(u); + const auto [new_cluster, new_gain] = + find_best_cluster(u, u_weight, u_cluster, local_rand, local_rating_map); + + if (derived_cluster(u) != new_cluster) { + if (derived_move_cluster_weight( + u_cluster, new_cluster, u_weight, derived_max_cluster_weight(new_cluster) + )) { + derived_move_node(u, new_cluster); + activate_neighbors(u); + IFSTATS(_expected_total_gain += new_gain); + + const bool decrement_cluster_count = + Config::kTrackClusterCount && derived_cluster_weight(u_cluster) == 0; + // do not update _current_num_clusters here to avoid fetch_add() + return {true, decrement_cluster_count}; // did move, did reduce nonempty + // cluster count? + } + } + + // did not move, did not reduce cluster count + return {false, false}; + } + + struct ClusterSelectionState { + Random &local_rand; + NodeID u; + NodeWeight u_weight; + ClusterID initial_cluster; + ClusterWeight initial_cluster_weight; + ClusterID best_cluster; + EdgeWeight best_gain; + ClusterWeight best_cluster_weight; + ClusterID current_cluster; + EdgeWeight current_gain; + ClusterWeight current_cluster_weight; + }; + + /*! + * Computes the best feasible cluster for a node. 
+ * + * @param u The node for which the cluster is computed. + * @param u_weight The weight of the node. + * @param u_cluster The current cluster of the node. + * @param local_rand Thread-local \c Random object. + * @param local_rating_map Thread-local rating map to compute gain values. + * @return Pair with: new cluster of the node, gain value for the move to the + * new cluster. + */ + template + std::pair find_best_cluster( + const NodeID u, + const NodeWeight u_weight, + const ClusterID u_cluster, + Random &local_rand, + LocalRatingMap &local_rating_map + ) { + auto action = [&](auto &map) { + const ClusterWeight initial_cluster_weight = derived_cluster_weight(u_cluster); + ClusterSelectionState state{ + .local_rand = local_rand, + .u = u, + .u_weight = u_weight, + .initial_cluster = u_cluster, + .initial_cluster_weight = initial_cluster_weight, + .best_cluster = u_cluster, + .best_gain = 0, + .best_cluster_weight = initial_cluster_weight, + .current_cluster = 0, + .current_gain = 0, + .current_cluster_weight = 0, + }; + + bool is_interface_node = false; + + auto add_to_rating_map = [&](const EdgeID e, const NodeID v) { + if (derived_accept_neighbor(u, v)) { + const ClusterID v_cluster = derived_cluster(v); + const EdgeWeight rating = _graph->edge_weight(e); + map[v_cluster] += rating; + if constexpr (Config::kUseLocalActiveSetStrategy) { + is_interface_node |= v >= _num_active_nodes; + } + } + }; + + const EdgeID from = _graph->first_edge(u); + const EdgeID to = from + std::min(_graph->degree(u), _max_num_neighbors); + for (EdgeID e = from; e < to; ++e) { + add_to_rating_map(e, _graph->edge_target(e)); + } + + if constexpr (Config::kUseLocalActiveSetStrategy) { + if (!is_interface_node) { + _active[u] = 0; + } + } + if constexpr (Config::kUseActiveSetStrategy) { + _active[u] = 0; + } + + // After LP, we might want to use 2-hop clustering to merge nodes that + // could not find any cluster to join for this, we store a favored cluster + // for each node u if: + // (1) we actually use 2-hop clustering + // (2) u is still in a singleton cluster (weight of node == weight of cluster) + // (3) the cluster is light (at most half full) + ClusterID favored_cluster = u_cluster; + const bool store_favored_cluster = + Config::kUseTwoHopClustering && u_weight == initial_cluster_weight && + initial_cluster_weight <= derived_max_cluster_weight(u_cluster) / 2; + + const EdgeWeight gain_delta = (Config::kUseActualGain) ? map[u_cluster] : 0; + + for (const auto [cluster, rating] : map.entries()) { + state.current_cluster = cluster; + state.current_gain = rating - gain_delta; + state.current_cluster_weight = derived_cluster_weight(cluster); + + if (store_favored_cluster && state.current_gain > state.best_gain) { + favored_cluster = state.current_cluster; + } + + if (derived_accept_cluster(state)) { + state.best_cluster = state.current_cluster; + state.best_cluster_weight = state.current_cluster_weight; + state.best_gain = state.current_gain; + } + } + + // if we couldn't join any cluster, we store the favored cluster + if (store_favored_cluster && state.best_cluster == state.initial_cluster) { + _favored_clusters[u] = favored_cluster; + } + + const EdgeWeight actual_gain = IFSTATS(state.best_gain - map[state.initial_cluster]); + map.clear(); + return std::make_pair(state.best_cluster, actual_gain); + }; + + const auto [best_cluster, gain] = local_rating_map.execute( + std::min(_graph->degree(u), _initial_num_clusters), action + ); + + return {best_cluster, gain}; + } + + /*! 
+ * Flags neighbors of a node that has been moved as active. + * + * @param u Node that was moved. + */ + void activate_neighbors(const NodeID u) { + for (const NodeID v : _graph->adjacent_nodes(u)) { + // call derived_activate_neighbor() even if we do not use the active set + // strategy since the function might have side effects; the compiler + // should remove it if it does not side effects + if (derived_activate_neighbor(v)) { + if constexpr (Config::kUseActiveSetStrategy || Config::kUseLocalActiveSetStrategy) { + _active[v].store(1, std::memory_order_relaxed); + } + } + } + } + + void match_isolated_nodes( + const NodeID from = 0, const NodeID to = std::numeric_limits::max() + ) { + handle_isolated_nodes_impl(from, to); + } + + void cluster_isolated_nodes( + const NodeID from = 0, const NodeID to = std::numeric_limits::max() + ) { + handle_isolated_nodes_impl(from, to); + } + + template + void handle_isolated_nodes_impl( + const NodeID from = 0, const NodeID to = std::numeric_limits::max() + ) { + constexpr ClusterID kInvalidClusterID = std::numeric_limits::max(); + tbb::enumerable_thread_specific current_cluster_ets(kInvalidClusterID); + + tbb::parallel_for( + tbb::blocked_range(from, std::min(_graph->n(), to)), + [&](tbb::blocked_range r) { + ClusterID cluster = current_cluster_ets.local(); + + for (NodeID u = r.begin(); u != r.end(); ++u) { + if (_graph->degree(u) == 0) { + const ClusterID cu = derived_cluster(u); + + if (cluster != kInvalidClusterID && + derived_move_cluster_weight( + cu, cluster, derived_cluster_weight(cu), derived_max_cluster_weight(cluster) + )) { + derived_move_node(u, cluster); + if constexpr (match) { + cluster = kInvalidClusterID; + } + } else { + cluster = cu; + } + } + } + + current_cluster_ets.local() = cluster; + } + ); + } + + void match_two_hop_nodes_threadwise( + const NodeID from = 0, const NodeID to = std::numeric_limits::max() + ) { + handle_two_hop_nodes_threadwise_impl(from, to); + } + + void cluster_two_hop_nodes_threadwise( + const NodeID from = 0, const NodeID to = std::numeric_limits::max() + ) { + handle_two_hop_nodes_threadwise_impl(from, to); + } + + template + void handle_two_hop_nodes_threadwise_impl( + const NodeID from = 0, const NodeID to = std::numeric_limits::max() + ) { + static_assert(Config::kUseTwoHopClustering, "2-hop clustering is disabled"); + + tbb::enumerable_thread_specific> matching_map_ets; + + auto is_considered_for_two_hop_clustering = [&](const NodeID u) { + // Skip nodes not considered for two-hop clustering + if (_graph->degree(u) == 0) { + // Not considered: isolated node + return false; + } else if (u != derived_cluster(u)) { + // Not considered: joined another cluster + return false; + } else { + // If u did not join another cluster, there could still be other nodes that joined this + // node's cluster: find out by checking the cluster weight + const ClusterWeight current_weight = derived_cluster_weight(u); + if (current_weight > derived_max_cluster_weight(u) / 2 || + current_weight != derived_initial_cluster_weight(u)) { + // Not considered: not a singleton cluster; or its weight is too heavy + return false; + } + } + + return true; + }; + + auto handle_node = [&](DynamicFlatMap &matching_map, const NodeID u) { + ClusterID &rep_key = matching_map[_favored_clusters[u]]; + + if (rep_key == 0) { + rep_key = u + 1; + } else { + const ClusterID rep = rep_key - 1; + + const bool could_move_u_to_rep = derived_move_cluster_weight( + u, rep, derived_cluster_weight(u), derived_max_cluster_weight(rep) + ); + + if 
constexpr (match) { + KASSERT(could_move_u_to_rep); + derived_move_node(u, rep); + rep_key = 0; + } else { + if (could_move_u_to_rep) { + derived_move_node(u, rep); + } else { + rep_key = u + 1; + } + } + } + }; + + tbb::parallel_for( + tbb::blocked_range(from, std::min(to, _graph->n()), 512), + [&](const tbb::blocked_range &r) { + auto &matching_map = matching_map_ets.local(); + + for (NodeID u = r.begin(); u != r.end(); ++u) { + if (is_considered_for_two_hop_clustering(u)) { + handle_node(matching_map, u); + } + } + } + ); + } + + void match_two_hop_nodes( + const NodeID from = 0, const NodeID to = std::numeric_limits::max() + ) { + handle_two_hop_nodes_impl(from, to); + } + + void cluster_two_hop_nodes( + const NodeID from = 0, const NodeID to = std::numeric_limits::max() + ) { + handle_two_hop_nodes_impl(from, to); + } + + template + void handle_two_hop_nodes_impl( + const NodeID from = 0, const NodeID to = std::numeric_limits::max() + ) { + static_assert(Config::kUseTwoHopClustering, "2-hop clustering is disabled"); + + auto is_considered_for_two_hop_clustering = [&](const NodeID u) { + // Skip nodes not considered for two-hop clustering + if (_graph->degree(u) == 0) { + // Not considered: isolated node + return false; + } else if (u != derived_cluster(u)) { + // Not considered: joined another cluster + return false; + } else { + // If u did not join another cluster, there could still be other nodes that joined this + // node's cluster: find out by checking the cluster weight + const ClusterWeight current_weight = derived_cluster_weight(u); + if (current_weight > derived_max_cluster_weight(u) / 2 || + current_weight != derived_initial_cluster_weight(u)) { + // Not considered: not a singleton cluster; or its weight is too heavy + return false; + } + } + + return true; + }; + + // There could be edge cases where the favorite cluster of a node is itself a singleton cluster + // (for instance, if a node joins another cluster during the first round, but moves out of the + // cluster in the next round) + // Since the following code is based on the ansumption that the favorite cluster of a node that + // is considered for two-hop clustering it itself not considere for two-hop clustering, we fix + // this situation by moving the nodes to their favorite cluster, if possible, here. 
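Both two-hop variants rest on the same idea: a node that stayed in a singleton cluster remembers its highest-rated (favored) cluster, and singletons that share a favored cluster are merged with each other; the threadwise variant above does this grouping in a thread-local map keyed by the favored cluster. A simplified sequential sketch of that grouping (illustrative standalone code; it ignores cluster-weight constraints and the matching mode, which resets the representative after a single merge):

#include <cstdint>
#include <unordered_map>
#include <vector>

using NodeID = std::uint32_t;

// clusters[u] == u  <=> u still leads a singleton cluster.
// favored[u]        == highest-rated cluster recorded for u during label propagation.
// Merges singleton leaders that share the same favored cluster: the first one
// seen becomes the representative, later ones join it.
void cluster_two_hop_sequential(std::vector<NodeID> &clusters,
                                const std::vector<NodeID> &favored) {
  std::unordered_map<NodeID, NodeID> representative; // favored cluster -> representative node
  for (NodeID u = 0; u < clusters.size(); ++u) {
    if (clusters[u] != u) {
      continue; // already joined another cluster: not eligible for two-hop clustering
    }
    const auto [it, inserted] = representative.emplace(favored[u], u);
    if (!inserted) {
      clusters[u] = it->second; // a singleton already claimed this favored cluster: join it
    }
  }
}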
+ tbb::parallel_for(from, std::min(to, _graph->n()), [&](const NodeID u) { + if (is_considered_for_two_hop_clustering(u)) { + const NodeID cluster = _favored_clusters[u]; + if (is_considered_for_two_hop_clustering(cluster) && + derived_move_cluster_weight( + u, cluster, derived_cluster_weight(u), derived_max_cluster_weight(cluster) + )) { + derived_move_node(u, cluster); + --_current_num_clusters; + } + } else { + _favored_clusters[u] = u; + } + }); + + KASSERT( + [&] { + for (NodeID u = from; u < std::min(to, _graph->n()); ++u) { + if (_favored_clusters[u] >= _graph->n()) { + LOG_WARNING << "favored cluster of node " << u + << " out of bounds: " << _favored_clusters[u] << " > " << _graph->n(); + } + if (u != _favored_clusters[u] && is_considered_for_two_hop_clustering(u) && + is_considered_for_two_hop_clustering(_favored_clusters[u])) { + LOG_WARNING << "node " << u << " (degree " << _graph->degree(u) << " )" + << " is considered for two-hop clustering, but its favored cluster " + << _favored_clusters[u] << " (degree " + << _graph->degree(_favored_clusters[u]) + << ") is also considered for two-hop clustering"; + return false; + } + } + return true; + }(), + "precondition for two-hop clustering violated: found favored clusters that could be joined", + assert::heavy + ); + + // During label propagation, we store the best cluster for each node in _favored_cluster[] + // regardless of whether there is enough space in the cluster for the node to join. + // We now use this information to merge nodes that could not join any cluster, i.e., + // singleton-clusters by clustering or matching nodes that have favored cluster. + + tbb::parallel_for(from, std::min(to, _graph->n()), [&](const NodeID u) { + if (should_stop()) { + return; + } + + // Skip nodes not considered for two-hop clustering + if (!is_considered_for_two_hop_clustering(u)) { + return; + } + + // Invariant: + // For each node u that is considered for two-hop clustering (i.e., nodes for which the + // following lines of code are executed), _favored_clusters[u] refers to node which *IS NOT* + // considered for two-hop matching. + // + // Reasoning: + // KASSERT() + // + // Conclusion: + // We can use _favored_clusters[u] to build the two-hop clusters. 
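The loop that follows uses the entry _favored_clusters[C] itself as a synchronization slot: it still holds C while no representative has been claimed, and afterwards holds the node that represents the new two-hop cluster. A compact sketch of the claim-or-join step with std::atomic (standalone and simplified; the actual code additionally retries after failed weight moves and handles the matching-mode reset):

#include <atomic>
#include <cstdint>

using NodeID = std::uint32_t;

// `slot` stands for _favored_clusters[C] and initially holds C. The first node
// to arrive claims the slot and becomes the representative; every later node
// observes the representative and joins it.
// Returns the node whose cluster `u` should join (possibly `u` itself).
NodeID claim_or_join(std::atomic<NodeID> &slot, const NodeID C, const NodeID u) {
  NodeID observed = C;
  if (slot.compare_exchange_strong(observed, u)) {
    return u; // we became the representative; later nodes will join us
  }
  return observed; // slot already holds a representative: join it
}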
+ + const NodeID C = _favored_clusters[u]; + auto &sync = _favored_clusters[C]; + + do { + NodeID cluster = sync; + + if (cluster == C) { + if (sync.compare_exchange_strong(cluster, u)) { + // We are done: other nodes will join our cluster + break; + } + if (cluster == C) { + continue; + } + } + + // Invariant: cluster is a node with favored cluster C + KASSERT( + _favored_clusters[cluster] == C, + "invariant violated by: " << V(u) << V(cluster) << V(C) << V(_favored_clusters[C]) + ); + + // Try to join the cluster: + if constexpr (match) { + // Matching mode: try to build a cluster only containing nodes "cluster" and "u" + if (sync.compare_exchange_strong(cluster, C)) { + [[maybe_unused]] const bool success = derived_move_cluster_weight( + u, cluster, derived_cluster_weight(u), derived_max_cluster_weight(cluster) + ); + KASSERT( + success, + "node " << u << " could be matched with node " << cluster << ": " + << derived_cluster_weight(u) << " + " << derived_cluster_weight(cluster) + << " > " << derived_max_cluster_weight(cluster) + ); + + derived_move_node(u, cluster); + + // We are done: build a cluster with "cluster", reset "sync" to C + break; + } + } else { + // Clustering mode: try to join cluster "cluster" if the weight constraint permits it, + // otherwise try to start a new cluster + if (derived_move_cluster_weight( + u, cluster, derived_cluster_weight(u), derived_max_cluster_weight(cluster) + )) { + derived_move_node(u, cluster); + + // We are done: joined cluster "cluster" + break; + } else if (sync.compare_exchange_strong(cluster, u)) { + // We are done: other nodes will join our cluster + break; + } + } + } while (true); + }); + } + +private: + void reset_state() { + tbb::parallel_invoke( + [&] { + tbb::parallel_for(0, _graph->n(), [&](const NodeID u) { + if constexpr (Config::kUseActiveSetStrategy || Config::kUseLocalActiveSetStrategy) { + _active[u] = 1; + } + + const ClusterID initial_cluster = derived_initial_cluster(u); + derived_init_cluster(u, initial_cluster); + if constexpr (Config::kUseTwoHopClustering) { + _favored_clusters[u] = initial_cluster; + } + + derived_reset_node_state(u); + }); + }, + [&] { + tbb::parallel_for(0, _initial_num_clusters, [&](const ClusterID cluster) { + derived_init_cluster_weight(cluster, derived_initial_cluster_weight(cluster)); + }); + } + ); + IFSTATS(_expected_total_gain = 0); + _current_num_clusters = _initial_num_clusters; + } + +private: // CRTP calls + //! Return current cluster ID of node \c u. + [[nodiscard]] ClusterID derived_cluster(const NodeID u) { + return static_cast(this)->cluster(u); + } + + //! Initially place \c u in cluster \cluster. + void derived_init_cluster(const NodeID u, const ClusterID cluster) { + static_cast(this)->init_cluster(u, cluster); + } + + //! Change cluster of node \c u to \c cluster. + void derived_move_node(const NodeID u, const ClusterID cluster) { + static_cast(this)->move_node(u, cluster); + } + + //! Return current weight of cluster \c cluster. + [[nodiscard]] ClusterWeight derived_cluster_weight(const ClusterID cluster) { + return static_cast(this)->cluster_weight(cluster); + } + + //! Initially set weight of cluster \cluster to \c weight. + void derived_init_cluster_weight(const ClusterID cluster, const ClusterWeight weight) { + static_cast(this)->init_cluster_weight(cluster, weight); + } + + //! Attempt to move \c delta weight from cluster \c old_cluster to \c + //! new_cluster, which can take at most \c max_weight weight. 
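The derived_* wrappers in this section forward to the concrete subclass via CRTP: the base template casts this to Derived and calls the hook the subclass provides, so the hot loop needs no virtual dispatch. A minimal self-contained sketch of the pattern (illustrative names only, not the patch's classes):

#include <cstdint>
#include <iostream>

// The base template implements the generic algorithm and reaches the
// subclass's hooks through static_cast<Derived *> -- resolved at compile
// time, no virtual dispatch.
template <typename Derived> class ClusterBase {
public:
  void run(const std::uint32_t node) {
    const std::uint32_t c = static_cast<Derived *>(this)->cluster(node);
    static_cast<Derived *>(this)->move_node(node, c + 1);
  }
};

class MyClustering : public ClusterBase<MyClustering> {
public:
  std::uint32_t cluster(const std::uint32_t node) const { return node % 4; }
  void move_node(const std::uint32_t node, const std::uint32_t c) {
    std::cout << "node " << node << " -> cluster " << c << '\n';
  }
};

int main() {
  MyClustering lp;
  lp.run(10); // prints "node 10 -> cluster 3"
}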
+ [[nodiscard]] bool derived_move_cluster_weight( + const ClusterID old_cluster, + const ClusterID new_cluster, + const ClusterWeight delta, + const ClusterWeight max_weight + ) { + return static_cast(this)->move_cluster_weight( + old_cluster, new_cluster, delta, max_weight + ); + } + + //! Return the maximum weight of cluster \c cluster. + [[nodiscard]] ClusterWeight derived_max_cluster_weight(const ClusterID cluster) { + return static_cast(this)->max_cluster_weight(cluster); + } + + //! Determine whether a node should be moved to a new cluster. + [[nodiscard]] bool derived_accept_cluster(const ClusterSelectionState &state) { + return static_cast(this)->accept_cluster(state); + } + + void derived_reset_node_state(const NodeID u) { + static_cast(this)->reset_node_state(u); + } + + [[nodiscard]] inline bool derived_accept_neighbor(const NodeID u, const NodeID v) { + return static_cast(this)->accept_neighbor(u, v); + } + + [[nodiscard]] inline bool derived_activate_neighbor(const NodeID u) { + return static_cast(this)->activate_neighbor(u); + } + + [[nodiscard]] ClusterID derived_initial_cluster(const NodeID u) { + return static_cast(this)->initial_cluster(u); + } + + [[nodiscard]] ClusterWeight derived_initial_cluster_weight(const ClusterID cluster) { + return static_cast(this)->initial_cluster_weight(cluster); + } + + [[nodiscard]] bool derived_skip_node(const NodeID node) { + return static_cast(this)->skip_node(node); + } + +protected: // Default implementations + void reset_node_state(const NodeID /* node */) {} + + [[nodiscard]] inline bool accept_neighbor(const NodeID /* u */, const NodeID /* v */) { + return true; + } + + [[nodiscard]] inline bool activate_neighbor(const NodeID /* node */) { + return true; + } + + [[nodiscard]] inline ClusterID initial_cluster(const NodeID u) { + return derived_cluster(u); + } + + [[nodiscard]] inline ClusterWeight initial_cluster_weight(const ClusterID cluster) { + return derived_cluster_weight(cluster); + } + + [[nodiscard]] inline bool skip_node(const NodeID /* node */) { + return false; + } + +protected: // Members + //! Graph we operate on, or \c nullptr if \c initialize has not been called + //! yet. + const Graph *_graph{nullptr}; + + //! The number of non-empty clusters before we ran the first iteration of + //! label propagation. + ClusterID _initial_num_clusters; + + //! The current number of non-empty clusters. Only meaningful if empty + //! clusters are being counted. + parallel::Atomic _current_num_clusters; + + //! We stop label propagation if the number of non-empty clusters falls below + //! this threshold. Only has an effect if empty clusters are being counted. + ClusterID _desired_num_clusters = 0; + + //! We do not move nodes with a degree higher than this. However, other nodes + //! may still be moved to the cluster of with degree larger than this + //! threshold. + NodeID _max_degree = std::numeric_limits::max(); + + //! When computing the gain values for a node, this is an upper limit on the + //! number of neighbors of the nodes we consider. Any more neighbors are + //! ignored. + NodeID _max_num_neighbors = std::numeric_limits::max(); + + //! Thread-local map to compute gain values. + tbb::enumerable_thread_specific _rating_map_ets{[this] { + return RatingMap(_num_clusters); + }}; + + //! Flags nodes with at least one node in its neighborhood that changed + //! clusters during the last iteration. Nodes without this flag set must not + //! be considered in the next iteration. + scalable_vector> _active; + + //! 
If a node cannot join any cluster during an iteration, this vector stores + //! the node's highest rated cluster independent of the maximum cluster + //! weight. This information is used during 2-hop clustering. + scalable_vector> _favored_clusters; + + //! If statistics are enabled, this is the sum of the gain of all moves that + //! were performed. If executed single-thread, this should be equal to the + //! reduction of the edge cut. + parallel::Atomic _expected_total_gain; + +private: + NodeID _num_nodes = 0; + NodeID _num_active_nodes = 0; + ClusterID _num_clusters = 0; +}; + +/*! + * Parallel label propagation template that iterates over nodes in their natural + * order. + * @tparam Derived Derived subclass for static polymorphism. + * @tparam Config Algorithmic configuration and data types. + */ +template +class InOrderLabelPropagation : public LabelPropagation { + static_assert(std::is_base_of_v); + SET_DEBUG(true); + +protected: + using Base = LabelPropagation; + + using Graph = typename Base::Graph; + using ClusterID = typename Base::ClusterID; + using ClusterWeight = typename Base::ClusterWeight; + using EdgeID = typename Base::EdgeID; + using EdgeWeight = typename Base::EdgeWeight; + using NodeID = typename Base::NodeID; + using NodeWeight = typename Base::NodeWeight; + + using Base::handle_node; + using Base::set_max_degree; + using Base::set_max_num_neighbors; + using Base::should_stop; + + NodeID + perform_iteration(const NodeID from = 0, const NodeID to = std::numeric_limits::max()) { + tbb::enumerable_thread_specific num_moved_nodes_ets; + + tbb::parallel_for( + tbb::blocked_range(from, std::min(_graph->n(), to)), + [&](const auto &r) { + EdgeID work_since_update = 0; + NodeID num_removed_clusters = 0; + + auto &num_moved_nodes = num_moved_nodes_ets.local(); + auto &rand = Random::instance(); + auto &rating_map = _rating_map_ets.local(); + + for (NodeID u = r.begin(); u != r.end(); ++u) { + if (_graph->degree(u) > _max_degree) { + continue; + } + + if constexpr (Config::kUseActiveSetStrategy || Config::kUseLocalActiveSetStrategy) { + if (!_active[u].load(std::memory_order_relaxed)) { + continue; + } + } + + if (work_since_update > Config::kMinChunkSize) { + if (Base::should_stop()) { + return; + } + + _current_num_clusters -= num_removed_clusters; + work_since_update = 0; + num_removed_clusters = 0; + } + + const auto [moved_node, emptied_cluster] = handle_node(u, rand, rating_map); + work_since_update += _graph->degree(u); + if (moved_node) { + ++num_moved_nodes; + } + if (emptied_cluster) { + ++num_removed_clusters; + } + } + } + ); + + return num_moved_nodes_ets.combine(std::plus{}); + } + + using Base::_active; + using Base::_current_num_clusters; + using Base::_graph; + using Base::_max_degree; + using Base::_rating_map_ets; +}; + +/*! + * Parallel label propagation template that iterates over nodes in chunk random + * order. + * @tparam Derived Derived subclass for static polymorphism. + * @tparam Config Algorithmic configuration and data types. 
+ */ +template <typename Derived, typename Config> +class ChunkRandomdLabelPropagation : public LabelPropagation<Derived, Config> { + using Base = LabelPropagation<Derived, Config>; + static_assert(std::is_base_of_v<LabelPropagationConfig, Config>); + + SET_DEBUG(false); + +protected: + using Graph = typename Base::Graph; + using ClusterID = typename Base::ClusterID; + using ClusterWeight = typename Base::ClusterWeight; + using EdgeID = typename Base::EdgeID; + using EdgeWeight = typename Base::EdgeWeight; + using NodeID = typename Base::NodeID; + using NodeWeight = typename Base::NodeWeight; + + using Base::handle_node; + using Base::set_max_degree; + using Base::set_max_num_neighbors; + using Base::should_stop; + + void initialize(const Graph *graph, const ClusterID num_clusters) { + Base::initialize(graph, num_clusters); + _chunks.clear(); + _buckets.clear(); + } + + /** + * Performs label propagation on local nodes in range [from, to) in + * chunk-randomized order. + * + * The randomization works in multiple steps: + * - Nodes within the iteration order are split into chunks of consecutive + * nodes. The size of each chunk is determined by + * LabelPropagationConfig::kMinChunkSize, which is a lower bound on the sum of + * the degrees assigned to a chunk (nodes are assigned to a chunk until the + * limit is exceeded). + * - Afterwards, the order of the chunks is shuffled. + * - Finally, chunks are processed in parallel. To this end, the nodes + * assigned to a chunk are once more split into sub-chunks, which are then + * processed sequentially and in-order; however, within a sub-chunk, nodes are + * once more shuffled. + * - If available, degree buckets are respected: chunks of smaller buckets are + * processed before chunks of larger buckets. + * + * @param from First node in the iteration range. + * @param to First node that is not part of the iteration range. + * @return Number of nodes that were moved to new blocks / clusters.
+ */ + NodeID + perform_iteration(const NodeID from = 0, const NodeID to = std::numeric_limits::max()) { + if (from != 0 || to != std::numeric_limits::max()) { + _chunks.clear(); + } + if (_chunks.empty()) { + init_chunks(from, to); + } + shuffle_chunks(); + + tbb::enumerable_thread_specific num_moved_nodes_ets; + parallel::Atomic next_chunk = 0; + + tbb::parallel_for(static_cast(0), _chunks.size(), [&](const std::size_t) { + if (should_stop()) { + return; + } + + auto &local_num_moved_nodes = num_moved_nodes_ets.local(); + auto &local_rand = Random::instance(); + auto &local_rating_map = _rating_map_ets.local(); + NodeID num_removed_clusters = 0; + + const auto chunk_id = next_chunk.fetch_add(1, std::memory_order_relaxed); + const auto &chunk = _chunks[chunk_id]; + const auto &permutation = _random_permutations.get(local_rand); + + const std::size_t num_sub_chunks = + std::ceil(1.0 * (chunk.end - chunk.start) / Config::kPermutationSize); + std::vector sub_chunk_permutation(num_sub_chunks); + std::iota(sub_chunk_permutation.begin(), sub_chunk_permutation.end(), 0); + local_rand.shuffle(sub_chunk_permutation); + + for (std::size_t sub_chunk = 0; sub_chunk < num_sub_chunks; ++sub_chunk) { + for (std::size_t i = 0; i < Config::kPermutationSize; ++i) { + const NodeID u = chunk.start + + Config::kPermutationSize * sub_chunk_permutation[sub_chunk] + + permutation[i % Config::kPermutationSize]; + if (u < chunk.end && _graph->degree(u) < _max_degree && + ((!Config::kUseActiveSetStrategy && !Config::kUseLocalActiveSetStrategy) || + _active[u].load(std::memory_order_relaxed))) { + const auto [moved_node, emptied_cluster] = handle_node(u, local_rand, local_rating_map); + if (moved_node) { + ++local_num_moved_nodes; + } + if (emptied_cluster) { + ++num_removed_clusters; + } + } + } + } + + _current_num_clusters -= num_removed_clusters; + }); + + return num_moved_nodes_ets.combine(std::plus{}); + } + +private: + struct Chunk { + NodeID start; + NodeID end; + }; + + struct Bucket { + std::size_t start; + std::size_t end; + }; + + void shuffle_chunks() { + tbb::parallel_for(0, _buckets.size(), [&](const std::size_t i) { + const auto &bucket = _buckets[i]; + Random::instance().shuffle(_chunks.begin() + bucket.start, _chunks.begin() + bucket.end); + }); + } + + void init_chunks(const NodeID from, NodeID to) { + _chunks.clear(); + _buckets.clear(); + + to = std::min(to, _graph->n()); + + const auto max_bucket = + std::min(math::floor_log2(_max_degree), _graph->number_of_buckets()); + const EdgeID max_chunk_size = std::max(Config::kMinChunkSize, std::sqrt(_graph->m())); + const NodeID max_node_chunk_size = + std::max(Config::kMinChunkSize, std::sqrt(_graph->n())); + + NodeID position = 0; + for (std::size_t bucket = 0; bucket < max_bucket; ++bucket) { + if (position + _graph->bucket_size(bucket) < from || _graph->bucket_size(bucket) == 0) { + position += _graph->bucket_size(bucket); + continue; + } + if (position >= to) { + break; + } + + NodeID remaining_bucket_size = _graph->bucket_size(bucket); + if (from > _graph->first_node_in_bucket(bucket)) { + remaining_bucket_size -= from - _graph->first_node_in_bucket(bucket); + } + const std::size_t bucket_size = + std::min({remaining_bucket_size, to - position, to - from}); + + parallel::Atomic offset = 0; + tbb::enumerable_thread_specific num_chunks_ets; + tbb::enumerable_thread_specific> chunks_ets; + + const std::size_t bucket_start = std::max(_graph->first_node_in_bucket(bucket), from); + + tbb::parallel_for( + static_cast(0), + 
tbb::this_task_arena::max_concurrency(), + [&](const int) { + auto &chunks = chunks_ets.local(); + auto &num_chunks = num_chunks_ets.local(); + + while (offset < bucket_size) { + const NodeID begin = offset.fetch_add(max_node_chunk_size); + if (begin >= bucket_size) { + break; + } + const NodeID end = std::min(begin + max_node_chunk_size, bucket_size); + + EdgeID current_chunk_size = 0; + NodeID chunk_start = bucket_start + begin; + + for (NodeID i = begin; i < end; ++i) { + const NodeID u = bucket_start + i; + current_chunk_size += _graph->degree(u); + if (current_chunk_size >= max_chunk_size) { + chunks.push_back({chunk_start, u + 1}); + chunk_start = u + 1; + current_chunk_size = 0; + ++num_chunks; + } + } + + if (current_chunk_size > 0) { + chunks.push_back( + {static_cast(chunk_start), static_cast(bucket_start + end)} + ); + ++num_chunks; + } + } + } + ); + + const std::size_t num_chunks = num_chunks_ets.combine(std::plus{}); + + const std::size_t chunks_start = _chunks.size(); + parallel::Atomic pos = chunks_start; + _chunks.resize(chunks_start + num_chunks); + tbb::parallel_for(chunks_ets.range(), [&](auto &r) { + for (auto &chunk : r) { + const std::size_t local_pos = pos.fetch_add(chunk.size()); + std::copy(chunk.begin(), chunk.end(), _chunks.begin() + local_pos); + } + }); + + _buckets.push_back({chunks_start, _chunks.size()}); + + position += _graph->bucket_size(bucket); + } + + // Make sure that we cover all nodes in [from, to) + KASSERT( + [&] { + std::vector hit(to - from); + for (const auto &[start, end] : _chunks) { + KASSERT(start <= end, ""); + EdgeWeight total_work = 0; + + for (NodeID u = start; u < end; ++u) { + KASSERT(from <= u, ""); + KASSERT(u < to, ""); + KASSERT(!hit[u - from], ""); + + hit[u - from] = true; + total_work += _graph->degree(u); + } + } + + for (NodeID u = 0; u < to - from; ++u) { + KASSERT( + _graph->degree(u) == 0u || hit[u], + V(_graph->degree(u)) << V(from) << V(u + from) << V(to) + ); + } + + return true; + }(), + "", + assert::heavy + ); + } + +protected: + using Base::_active; + using Base::_current_num_clusters; + using Base::_graph; + using Base::_max_degree; + using Base::_rating_map_ets; + + RandomPermutations + _random_permutations{}; + std::vector _chunks; + std::vector _buckets; +}; + +template class NonatomicOwnedClusterVector { +public: + explicit NonatomicOwnedClusterVector(const NodeID max_num_nodes) : _clusters(max_num_nodes) { + tbb::parallel_for(0, max_num_nodes, [&](const NodeID u) { _clusters[u] = 0; }); + } + + [[nodiscard]] auto &&take_clusters() { + return std::move(_clusters); + } + + [[nodiscard]] auto &clusters() { + return _clusters; + } + + void init_cluster(const NodeID node, const ClusterID cluster) { + move_node(node, cluster); + } + + [[nodiscard]] ClusterID cluster(const NodeID node) { + KASSERT(node < _clusters.size()); + return __atomic_load_n(&_clusters[node], __ATOMIC_RELAXED); + } + + void move_node(const NodeID node, const ClusterID cluster) { + KASSERT(node < _clusters.size()); + __atomic_store_n(&_clusters[node], cluster, __ATOMIC_RELAXED); + } + + void ensure_cluster_size(const NodeID max_num_nodes) { + if (_clusters.size() < max_num_nodes) { + _clusters.resize(max_num_nodes); + } + } + +private: + NoinitVector _clusters; +}; + +template class OwnedClusterVector { +public: + explicit OwnedClusterVector(const NodeID max_num_nodes) : _clusters(max_num_nodes) {} + + [[nodiscard]] auto &&take_clusters() { + return std::move(_clusters); + } + + [[nodiscard]] auto &clusters() { + return _clusters; + } + + void 
init_cluster(const NodeID node, const ClusterID cluster) { + _clusters[node] = cluster; + } + + [[nodiscard]] ClusterID cluster(const NodeID node) { + KASSERT(node < _clusters.size()); + return _clusters[node]; + } + + void move_node(const NodeID node, const ClusterID cluster) { + KASSERT(node < _clusters.size()); + _clusters[node] = cluster; + } + + void ensure_cluster_size(const NodeID max_num_nodes) { + if (_clusters.size() < max_num_nodes) { + _clusters.resize(max_num_nodes); + } + } + +private: + scalable_vector> _clusters; +}; + +template class OwnedRelaxedClusterWeightVector { +public: + explicit OwnedRelaxedClusterWeightVector(const ClusterID max_num_clusters) + : _cluster_weights(max_num_clusters) {} + + auto &&take_cluster_weights() { + return std::move(_cluster_weights); + } + + void init_cluster_weight(const ClusterID cluster, const ClusterWeight weight) { + _cluster_weights[cluster] = weight; + } + + ClusterWeight cluster_weight(const ClusterID cluster) { + return _cluster_weights[cluster]; + } + + bool move_cluster_weight( + const ClusterID old_cluster, + const ClusterID new_cluster, + const ClusterWeight delta, + const ClusterWeight max_weight + ) { + if (_cluster_weights[new_cluster] + delta <= max_weight) { + _cluster_weights[new_cluster].fetch_add(delta, std::memory_order_relaxed); + _cluster_weights[old_cluster].fetch_sub(delta, std::memory_order_relaxed); + return true; + } + return false; + } + +private: + scalable_vector> _cluster_weights; +}; +} // namespace kaminpar::dist diff --git a/kaminpar-dist/dkaminpar.cc b/kaminpar-dist/dkaminpar.cc index be5a6ebc..5e180055 100644 --- a/kaminpar-dist/dkaminpar.cc +++ b/kaminpar-dist/dkaminpar.cc @@ -99,12 +99,9 @@ void print_input_summary( if (root && parseable) { LOG << "EXECUTION_MODE num_mpis=" << ctx.parallel.num_mpis << " num_threads=" << ctx.parallel.num_threads; - LOG << "INPUT_GRAPH " - << "global_n=" << graph.global_n() << " " - << "global_m=" << graph.global_m() << " " - << "n=[" << n_str << "] " - << "m=[" << m_str << "] " - << "ghost_n=[" << ghost_n_str << "]"; + LOG << "INPUT_GRAPH " << "global_n=" << graph.global_n() << " " + << "global_m=" << graph.global_m() << " " << "n=[" << n_str << "] " << "m=[" << m_str + << "] " << "ghost_n=[" << ghost_n_str << "]"; } // Output @@ -269,7 +266,9 @@ GlobalEdgeWeight dKaMinPar::compute_partition(const BlockID k, BlockID *partitio // level? // The binary interface already implements graph validation via KaGen, which can be enabled as a // CLI flag. There is no such option when using the library interface. 
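Note that move_cluster_weight() in OwnedRelaxedClusterWeightVector above checks the target weight and then applies the fetch_add as two separate relaxed steps, so two threads that pass the check concurrently can push a cluster slightly past max_weight. If the bound had to be strict, the weight could instead be reserved with a compare-exchange loop; a standalone sketch with std::atomic (not the patch's code):

#include <atomic>
#include <cstdint>

using ClusterWeight = std::int64_t;

// Reserves `delta` in the target cluster with a CAS loop so that max_weight
// can never be exceeded, then releases the weight from the source cluster.
bool move_cluster_weight_strict(std::atomic<ClusterWeight> &from, std::atomic<ClusterWeight> &to,
                                const ClusterWeight delta, const ClusterWeight max_weight) {
  ClusterWeight expected = to.load(std::memory_order_relaxed);
  do {
    if (expected + delta > max_weight) {
      return false; // target cluster would become too heavy
    }
  } while (!to.compare_exchange_weak(expected, expected + delta, std::memory_order_relaxed));
  from.fetch_sub(delta, std::memory_order_relaxed);
  return true;
}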
- KASSERT(debug::validate_graph(graph), "input graph failed graph verification", assert::heavy); + KASSERT( + dist::debug::validate_graph(graph), "input graph failed graph verification", assert::heavy + ); // Setup the remaining context options that are passed in via the constructor _ctx.parallel.num_mpis = size; @@ -293,7 +292,7 @@ GlobalEdgeWeight dKaMinPar::compute_partition(const BlockID k, BlockID *partitio STOP_TIMER(); KASSERT( - debug::validate_partition(p_graph), + dist::debug::validate_partition(p_graph), "graph partition verification failed after partitioning", assert::heavy ); diff --git a/kaminpar-dist/graphutils/bfs_extractor.cc b/kaminpar-dist/graphutils/bfs_extractor.cc index c94bb953..bbcd5013 100644 --- a/kaminpar-dist/graphutils/bfs_extractor.cc +++ b/kaminpar-dist/graphutils/bfs_extractor.cc @@ -27,7 +27,6 @@ #include "kaminpar-common/assert.h" #include "kaminpar-common/datastructures/marker.h" #include "kaminpar-common/datastructures/static_array.h" -#include "kaminpar-common/random.h" #include "kaminpar-common/timer.h" namespace kaminpar::dist::graph { @@ -222,7 +221,8 @@ auto BfsExtractor::exchange_explored_subgraphs( std::move(node_weights_recvbufs[pe]), std::move(edge_weights_recvbufs[pe]), std::move(node_mapping_recvbufs[pe]), - std::move(partition_recvbufs[pe])}; + std::move(partition_recvbufs[pe]) + }; }); return fragments; @@ -552,9 +552,9 @@ auto BfsExtractor::combine_fragments(tbb::concurrent_vector &frag }); // Construct shared-memory graph - auto graph = std::make_unique( + auto graph = std::make_unique(std::make_unique( std::move(nodes), std::move(edges), std::move(node_weights), std::move(edge_weights) - ); + )); auto p_graph = std::make_unique(*graph, _p_graph->k(), std::move(partition)); diff --git a/kaminpar-dist/graphutils/rearrangement.h b/kaminpar-dist/graphutils/rearrangement.h index 55047bae..8b3f7420 100644 --- a/kaminpar-dist/graphutils/rearrangement.h +++ b/kaminpar-dist/graphutils/rearrangement.h @@ -7,10 +7,10 @@ ******************************************************************************/ #pragma once -#include "kaminpar-dist/context.h" #include "kaminpar-dist/datastructures/distributed_graph.h" +#include "kaminpar-dist/dkaminpar.h" -#include "kaminpar-common/datastructures/scalable_vector.h" +#include "kaminpar-common/datastructures/static_array.h" namespace kaminpar::dist::graph { DistributedGraph rearrange(DistributedGraph graph, const Context &ctx); diff --git a/kaminpar-dist/graphutils/replicator.cc b/kaminpar-dist/graphutils/replicator.cc index 3b1249f7..988377f9 100644 --- a/kaminpar-dist/graphutils/replicator.cc +++ b/kaminpar-dist/graphutils/replicator.cc @@ -24,7 +24,6 @@ #include "kaminpar-shm/metrics.h" #include "kaminpar-common/datastructures/static_array.h" -#include "kaminpar-common/parallel/atomic.h" namespace kaminpar::dist { SET_DEBUG(false); @@ -188,7 +187,9 @@ shm::Graph replicate_graph_everywhere(const DistributedGraph &graph) { } }); - return {std::move(nodes), std::move(edges), std::move(node_weights), std::move(edge_weights)}; + return {std::make_unique( + std::move(nodes), std::move(edges), std::move(node_weights), std::move(edge_weights) + )}; } DistributedGraph replicate_graph(const DistributedGraph &graph, const int num_replications) { diff --git a/kaminpar-dist/graphutils/subgraph_extractor.cc b/kaminpar-dist/graphutils/subgraph_extractor.cc index 4480026c..fba94b74 100644 --- a/kaminpar-dist/graphutils/subgraph_extractor.cc +++ b/kaminpar-dist/graphutils/subgraph_extractor.cc @@ -27,6 +27,7 @@ #include 
"kaminpar-common/datastructures/static_array.h" #include "kaminpar-common/math.h" #include "kaminpar-common/parallel/algorithm.h" +#include "kaminpar-common/parallel/atomic.h" #include "kaminpar-common/parallel/vector_ets.h" namespace kaminpar::dist::graph { @@ -541,13 +542,13 @@ std::pair, std::vector>> construct_s } subgraphs_offsets[b].push_back(pos_n); - subgraphs[b] = shm::Graph( + subgraphs[b] = shm::Graph(std::make_unique( std::move(subgraph_nodes), std::move(subgraph_edges), std::move(subgraph_node_weights), std::move(subgraph_edge_weights), false - ); + )); }); return {std::move(subgraphs), std::move(subgraphs_offsets)}; @@ -607,7 +608,8 @@ extract_and_scatter_block_induced_subgraphs(const DistributedPartitionedGraph &p return { std::move(gathered_subgraphs), std::move(offsets), - std::move(extracted_local_subgraphs.mapping)}; + std::move(extracted_local_subgraphs.mapping) + }; } DistributedPartitionedGraph copy_subgraph_partitions( @@ -687,9 +689,7 @@ DistributedPartitionedGraph copy_subgraph_partitions( synchronize_ghost_node_block_ids(new_p_graph); KASSERT( - debug::validate_partition(new_p_graph), - "graph partition in inconsistent state", - assert::heavy + debug::validate_partition(new_p_graph), "graph partition in inconsistent state", assert::heavy ); return new_p_graph; } @@ -788,9 +788,7 @@ DistributedPartitionedGraph copy_duplicated_subgraph_partitions( synchronize_ghost_node_block_ids(new_p_graph); KASSERT( - debug::validate_partition(new_p_graph), - "graph partition in inconsistent state", - assert::heavy + debug::validate_partition(new_p_graph), "graph partition in inconsistent state", assert::heavy ); return new_p_graph; } diff --git a/kaminpar-dist/refinement/lp/clp_refiner.cc b/kaminpar-dist/refinement/lp/clp_refiner.cc index c8d323fd..0ef29a2f 100644 --- a/kaminpar-dist/refinement/lp/clp_refiner.cc +++ b/kaminpar-dist/refinement/lp/clp_refiner.cc @@ -21,11 +21,11 @@ #include "kaminpar-dist/datastructures/distributed_graph.h" #include "kaminpar-dist/datastructures/distributed_partitioned_graph.h" #include "kaminpar-dist/graphutils/communication.h" -#include "kaminpar-dist/metrics.h" #include "kaminpar-common/assert.h" #include "kaminpar-common/datastructures/rating_map.h" #include "kaminpar-common/parallel/algorithm.h" +#include "kaminpar-common/parallel/atomic.h" #include "kaminpar-common/parallel/vector_ets.h" #include "kaminpar-common/random.h" #include "kaminpar-common/timer.h" @@ -382,8 +382,8 @@ NodeID ColoredLPRefiner::perform_best_moves(const ColorID c) { return num_local_moved_nodes; } -auto ColoredLPRefiner::reduce_move_candidates(std::vector &&candidates) - -> std::vector { +auto ColoredLPRefiner::reduce_move_candidates(std::vector &&candidates +) -> std::vector { const int size = mpi::get_comm_size(_p_graph.communicator()); const int rank = mpi::get_comm_rank(_p_graph.communicator()); KASSERT(math::is_power_of_2(size), "#PE must be a power of two", assert::always); @@ -891,7 +891,7 @@ void ColoredLPRefiner::GainStatistics::record_gain(const EdgeWeight gain, const } void ColoredLPRefiner::GainStatistics::summarize_by_size( - const NoinitVector &color_sizes, MPI_Comm comm + const NoinitVector &color_sizes, MPI_Comm comm ) const { KASSERT(!_gain_per_color.empty(), "must call initialize() first"); KASSERT(_gain_per_color.size() <= color_sizes.size()); diff --git a/kaminpar-dist/refinement/lp/clp_refiner.h b/kaminpar-dist/refinement/lp/clp_refiner.h index e23762cf..69d29dd8 100644 --- a/kaminpar-dist/refinement/lp/clp_refiner.h +++ 
b/kaminpar-dist/refinement/lp/clp_refiner.h @@ -53,7 +53,7 @@ class ColoredLPRefiner : public GlobalRefiner { public: void initialize(ColorID num_colors); void record_gain(EdgeWeight gain, ColorID c); - void summarize_by_size(const NoinitVector &color_sizes, MPI_Comm comm) const; + void summarize_by_size(const NoinitVector &color_sizes, MPI_Comm comm) const; private: std::vector _gain_per_color; diff --git a/kaminpar-dist/refinement/lp/lp_refiner.cc b/kaminpar-dist/refinement/lp/lp_refiner.cc index 324fa37a..f1f69726 100644 --- a/kaminpar-dist/refinement/lp/lp_refiner.cc +++ b/kaminpar-dist/refinement/lp/lp_refiner.cc @@ -14,12 +14,10 @@ #include "kaminpar-dist/datastructures/distributed_graph.h" #include "kaminpar-dist/datastructures/distributed_partitioned_graph.h" +#include "kaminpar-dist/distributed_label_propagation.h" #include "kaminpar-dist/graphutils/communication.h" #include "kaminpar-dist/metrics.h" -#include "kaminpar-shm/label_propagation.h" - -#include "kaminpar-common/datastructures/marker.h" #include "kaminpar-common/datastructures/rating_map.h" #include "kaminpar-common/math.h" #include "kaminpar-common/parallel/vector_ets.h" @@ -41,7 +39,7 @@ struct LPRefinerConfig : public LabelPropagationConfig { }; class LPRefinerImpl final : public ChunkRandomdLabelPropagation { - SET_STATISTICS(false); + SET_STATISTICS_FROM_GLOBAL(); SET_DEBUG(false); using Base = ChunkRandomdLabelPropagation; diff --git a/kaminpar-shm/coarsening/cluster_coarsener.cc b/kaminpar-shm/coarsening/cluster_coarsener.cc index 7d1f70b5..d512568c 100644 --- a/kaminpar-shm/coarsening/cluster_coarsener.cc +++ b/kaminpar-shm/coarsening/cluster_coarsener.cc @@ -7,59 +7,112 @@ ******************************************************************************/ #include "kaminpar-shm/coarsening/cluster_coarsener.h" -#include "kaminpar-common/logger.h" +#include "kaminpar-shm/coarsening/contraction/cluster_contraction.h" +#include "kaminpar-shm/coarsening/max_cluster_weights.h" +#include "kaminpar-shm/factories.h" +#include "kaminpar-shm/kaminpar.h" + +#include "kaminpar-common/assert.h" +#include "kaminpar-common/heap_profiler.h" #include "kaminpar-common/timer.h" namespace kaminpar::shm { -std::pair ClusteringCoarsener::compute_coarse_graph( - const NodeWeight max_cluster_weight, const NodeID to_size -) { +ClusteringCoarsener::ClusteringCoarsener(const Context &ctx, const PartitionContext &p_ctx) + : _clustering_algorithm(factory::create_clusterer(ctx)), + _c_ctx(ctx.coarsening), + _p_ctx(p_ctx) {} + +void ClusteringCoarsener::initialize(const Graph *graph) { + _hierarchy.clear(); + _input_graph = graph; +} + +bool ClusteringCoarsener::coarsen() { + SCOPED_HEAP_PROFILER("Level", std::to_string(_hierarchy.size())); SCOPED_TIMER("Level", std::to_string(_hierarchy.size())); - _clustering_algorithm->set_max_cluster_weight(max_cluster_weight); - _clustering_algorithm->set_desired_cluster_count(to_size); + if (_clustering.size() < current().n()) { + SCOPED_HEAP_PROFILER("Allocation"); + SCOPED_TIMER("Allocation"); + _clustering.resize(current().n()); + } - const auto &clustering = TIMED_SCOPE("Label Propagation") { - return _clustering_algorithm->compute_clustering(*_current_graph); - }; + const bool free_allocated_memory = !keep_allocated_memory(); + const NodeWeight total_node_weight = current().total_node_weight(); + const NodeID prev_n = current().n(); + + START_HEAP_PROFILER("Label Propagation"); + START_TIMER("Label Propagation"); + _clustering_algorithm->set_max_cluster_weight( + compute_max_cluster_weight(_c_ctx, 
_p_ctx, prev_n, total_node_weight) + ); + _clustering_algorithm->set_desired_cluster_count(0); + _clustering_algorithm->compute_clustering(_clustering, current(), free_allocated_memory); + STOP_TIMER(); + STOP_HEAP_PROFILER(); - auto [c_graph, c_mapping, m_ctx] = TIMED_SCOPE("Contract graph") { - return graph::contract(*_current_graph, clustering, std::move(_contraction_m_ctx)); + START_HEAP_PROFILER("Contract graph"); + auto coarsened = TIMED_SCOPE("Contract graph") { + return contract_clustering(current(), _clustering, _c_ctx.contraction, _contraction_m_ctx); }; - _contraction_m_ctx = std::move(m_ctx); + _hierarchy.push_back(std::move(coarsened)); + STOP_HEAP_PROFILER(); - const bool converged = _c_ctx.coarsening_should_converge(_current_graph->n(), c_graph.n()); + const NodeID next_n = current().n(); + const bool converged = (1.0 - 1.0 * next_n / prev_n) <= _c_ctx.convergence_threshold; - _hierarchy.push_back(std::move(c_graph)); - _mapping.push_back(std::move(c_mapping)); - _current_graph = &_hierarchy.back(); + if (free_allocated_memory) { + _contraction_m_ctx.buckets.free(); + _contraction_m_ctx.buckets_index.free(); + _contraction_m_ctx.all_buffered_nodes.free(); + } - return {_current_graph, !converged}; + return !converged; } PartitionedGraph ClusteringCoarsener::uncoarsen(PartitionedGraph &&p_graph) { - KASSERT(&p_graph.graph() == _current_graph); - KASSERT(!empty(), V(size())); + SCOPED_HEAP_PROFILER("Level", std::to_string(_hierarchy.size())); SCOPED_TIMER("Level", std::to_string(_hierarchy.size())); + const BlockID p_graph_k = p_graph.k(); + const auto p_graph_partition = p_graph.take_raw_partition(); + + auto coarsened = pop_hierarchy(std::move(p_graph)); + const NodeID next_n = current().n(); + + START_HEAP_PROFILER("Allocation"); START_TIMER("Allocation"); - auto mapping{std::move(_mapping.back())}; - _mapping.pop_back(); - _hierarchy.pop_back(); // destroys the graph wrapped in p_graph, but partition - // access is still ok - _current_graph = empty() ? 
&_input_graph : &_hierarchy.back(); - KASSERT(mapping.size() == _current_graph->n(), V(mapping.size()) << V(_current_graph->n())); - - StaticArray partition(_current_graph->n()); + RECORD("partition") StaticArray partition(next_n); STOP_TIMER(); + STOP_HEAP_PROFILER(); - START_TIMER("Copy partition"); - tbb::parallel_for(static_cast(0), _current_graph->n(), [&](const NodeID u) { - partition[u] = p_graph.block(mapping[u]); - }); + START_TIMER("Project partition"); + coarsened->project(p_graph_partition, partition); STOP_TIMER(); + SCOPED_HEAP_PROFILER("Create graph"); SCOPED_TIMER("Create graph"); - return {*_current_graph, p_graph.k(), std::move(partition)}; + return {current(), p_graph_k, std::move(partition)}; +} + +std::unique_ptr ClusteringCoarsener::pop_hierarchy(PartitionedGraph &&p_graph) { + KASSERT(!empty(), "cannot pop from an empty graph hierarchy", assert::light); + + auto coarsened = std::move(_hierarchy.back()); + _hierarchy.pop_back(); + + KASSERT( + &coarsened->get() == &p_graph.graph(), + "p_graph wraps a different graph (ptr=" + << &p_graph.graph() << ") than the one that was coarsened (ptr=" << &coarsened->get() + << ")", + assert::light + ); + + return coarsened; +} + +bool ClusteringCoarsener::keep_allocated_memory() const { + return level() >= _c_ctx.clustering.max_mem_free_coarsening_level; } } // namespace kaminpar::shm diff --git a/kaminpar-shm/coarsening/cluster_coarsener.h b/kaminpar-shm/coarsening/cluster_coarsener.h index d79af08e..833cca35 100644 --- a/kaminpar-shm/coarsening/cluster_coarsener.h +++ b/kaminpar-shm/coarsening/cluster_coarsener.h @@ -9,24 +9,15 @@ #include "kaminpar-shm/coarsening/clusterer.h" #include "kaminpar-shm/coarsening/coarsener.h" -#include "kaminpar-shm/context.h" +#include "kaminpar-shm/coarsening/contraction/cluster_contraction.h" #include "kaminpar-shm/datastructures/graph.h" #include "kaminpar-shm/datastructures/partitioned_graph.h" -#include "kaminpar-shm/graphutils/cluster_contraction.h" #include "kaminpar-shm/kaminpar.h" namespace kaminpar::shm { class ClusteringCoarsener : public Coarsener { public: - ClusteringCoarsener( - std::unique_ptr clustering_algorithm, - const Graph &input_graph, - const CoarseningContext &c_ctx - ) - : _input_graph(input_graph), - _current_graph(&input_graph), - _clustering_algorithm(std::move(clustering_algorithm)), - _c_ctx(c_ctx) {} + ClusteringCoarsener(const Context &ctx, const PartitionContext &p_ctx); ClusteringCoarsener(const ClusteringCoarsener &) = delete; ClusteringCoarsener &operator=(const ClusteringCoarsener) = delete; @@ -34,33 +25,33 @@ class ClusteringCoarsener : public Coarsener { ClusteringCoarsener(ClusteringCoarsener &&) = delete; ClusteringCoarsener &operator=(ClusteringCoarsener &&) = delete; - std::pair - compute_coarse_graph(NodeWeight max_cluster_weight, NodeID to_size) final; + void initialize(const Graph *graph) final; + + bool coarsen() final; PartitionedGraph uncoarsen(PartitionedGraph &&p_graph) final; - [[nodiscard]] const Graph *coarsest_graph() const final { - return _current_graph; + [[nodiscard]] const Graph &current() const final { + return _hierarchy.empty() ? 
*_input_graph : _hierarchy.back()->get(); } - [[nodiscard]] std::size_t size() const final { + [[nodiscard]] std::size_t level() const final { return _hierarchy.size(); } - void initialize(const Graph *) final {} +private: + std::unique_ptr pop_hierarchy(PartitionedGraph &&p_graph); - [[nodiscard]] const CoarseningContext &context() const { - return _c_ctx; - } + [[nodiscard]] bool keep_allocated_memory() const; -private: - const Graph &_input_graph; - const Graph *_current_graph; - std::vector _hierarchy; - std::vector> _mapping; + const CoarseningContext &_c_ctx; + const PartitionContext &_p_ctx; + + const Graph *_input_graph; + std::vector> _hierarchy; + StaticArray _clustering{}; std::unique_ptr _clustering_algorithm; - const CoarseningContext &_c_ctx; - graph::contraction::MemoryContext _contraction_m_ctx{}; + contraction::MemoryContext _contraction_m_ctx{}; }; } // namespace kaminpar::shm diff --git a/kaminpar-shm/coarsening/clusterer.h b/kaminpar-shm/coarsening/clusterer.h index da64e4fd..857bc029 100644 --- a/kaminpar-shm/coarsening/clusterer.h +++ b/kaminpar-shm/coarsening/clusterer.h @@ -10,14 +10,11 @@ #include "kaminpar-shm/datastructures/graph.h" #include "kaminpar-shm/kaminpar.h" -#include "kaminpar-common/datastructures/scalable_vector.h" -#include "kaminpar-common/parallel/atomic.h" +#include "kaminpar-common/datastructures/static_array.h" namespace kaminpar::shm { class Clusterer { public: - using AtomicClusterArray = scalable_vector>; - Clusterer() = default; Clusterer(const Clusterer &) = delete; @@ -39,6 +36,8 @@ class Clusterer { // Clustering function // - virtual const AtomicClusterArray &compute_clustering(const Graph &graph) = 0; + virtual void compute_clustering( + StaticArray &clustering, const Graph &graph, bool free_memory_afterwards + ) = 0; }; } // namespace kaminpar::shm diff --git a/kaminpar-shm/coarsening/lp_clustering.cc b/kaminpar-shm/coarsening/clustering/legacy_lp_clusterer.cc similarity index 67% rename from kaminpar-shm/coarsening/lp_clustering.cc rename to kaminpar-shm/coarsening/clustering/legacy_lp_clusterer.cc index 919e76d1..bcf0915b 100644 --- a/kaminpar-shm/coarsening/lp_clustering.cc +++ b/kaminpar-shm/coarsening/clustering/legacy_lp_clusterer.cc @@ -1,18 +1,16 @@ /****************************************************************************** * Label propagation for graph coarsening / clustering. 
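// Editorial sketch (not part of the patch): driving the reworked Clusterer interface shown above,
// which now writes into a caller-owned array instead of returning an internal AtomicClusterArray.
// The StaticArray element type is assumed to be NodeID (the template arguments do not survive in
// this rendering of the patch), and run_clusterer_once() is a hypothetical helper that mirrors
// what ClusteringCoarsener::coarsen() does.
#include "kaminpar-shm/coarsening/clusterer.h"

#include "kaminpar-common/datastructures/static_array.h"

namespace kaminpar::shm {
void run_clusterer_once(Clusterer &clusterer, const Graph &graph, const NodeWeight max_cluster_weight) {
  StaticArray<NodeID> clustering(graph.n()); // caller-owned output buffer, one entry per node
  clusterer.set_max_cluster_weight(max_cluster_weight);
  clusterer.set_desired_cluster_count(0); // 0, as passed by ClusteringCoarsener::coarsen()
  clusterer.compute_clustering(clustering, graph, /* free_memory_afterwards = */ false);
  // clustering[u] now holds the cluster to which node u was assigned.
}
} // namespace kaminpar::shm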
* - * @file: lp_clustering.cc + * @file: legacy_lp_clusterer.cc * @author: Daniel Seemaier * @date: 29.09.2021 ******************************************************************************/ -#include "kaminpar-shm/coarsening/lp_clustering.h" +#include "kaminpar-shm/coarsening/clustering/legacy_lp_clusterer.h" #include -#include "kaminpar-shm/coarsening/clusterer.h" -#include "kaminpar-shm/context.h" #include "kaminpar-shm/datastructures/graph.h" -#include "kaminpar-shm/label_propagation.h" +#include "kaminpar-shm/legacy_label_propagation.h" #include "kaminpar-common/timer.h" @@ -21,42 +19,41 @@ namespace kaminpar::shm { // Actual implementation -- not exposed in header // -struct LPClusteringConfig : public LabelPropagationConfig { +struct LegacyLPClusteringConfig : public LegacyLabelPropagationConfig { using ClusterID = NodeID; using ClusterWeight = BlockWeight; static constexpr bool kTrackClusterCount = true; static constexpr bool kUseTwoHopClustering = true; }; -class LPClusteringImpl final - : public ChunkRandomdLabelPropagation, - public OwnedRelaxedClusterWeightVector, - public OwnedClusterVector, - public Clusterer { +class LegacyLPClusteringImpl final + : public ChunkRandomdLegacyLabelPropagation, + public LegacyOwnedRelaxedClusterWeightVector, + public LegacyNonatomicClusterVectorRef { SET_DEBUG(false); - using Base = ChunkRandomdLabelPropagation; - using ClusterWeightBase = OwnedRelaxedClusterWeightVector; - using ClusterBase = OwnedClusterVector; + using Base = ChunkRandomdLegacyLabelPropagation; + using ClusterWeightBase = LegacyOwnedRelaxedClusterWeightVector; + using ClusterBase = LegacyNonatomicClusterVectorRef; public: - LPClusteringImpl(const NodeID max_n, const CoarseningContext &c_ctx) - : ClusterWeightBase(max_n), - ClusterBase(max_n), - _c_ctx(c_ctx) { - allocate(max_n, max_n); - set_max_degree(c_ctx.lp.large_degree_threshold); - set_max_num_neighbors(c_ctx.lp.max_num_neighbors); + LegacyLPClusteringImpl(const CoarseningContext &c_ctx) : _lp_ctx(c_ctx.clustering.lp) { + set_max_degree(_lp_ctx.large_degree_threshold); + set_max_num_neighbors(_lp_ctx.max_num_neighbors); } - void set_max_cluster_weight(const NodeWeight max_cluster_weight) final { + void set_max_cluster_weight(const NodeWeight max_cluster_weight) { _max_cluster_weight = max_cluster_weight; } - const AtomicClusterArray &compute_clustering(const Graph &graph) final { + void compute_clustering(StaticArray &clustering, const CSRGraph &graph, bool) { + allocate(graph.n(), graph.n()); + allocate_cluster_weights(graph.n()); + + init_clusters_ref(clustering); initialize(&graph, graph.n()); - for (int iteration = 0; iteration < _c_ctx.lp.num_iterations; ++iteration) { + for (int iteration = 0; iteration < _lp_ctx.num_iterations; ++iteration) { SCOPED_TIMER("Iteration", std::to_string(iteration)); if (perform_iteration() == 0) { break; @@ -65,8 +62,6 @@ class LPClusteringImpl final cluster_isolated_nodes(); cluster_two_hop_nodes(); - - return clusters(); } private: @@ -77,7 +72,7 @@ class LPClusteringImpl final return; } - switch (_c_ctx.lp.two_hop_strategy) { + switch (_lp_ctx.two_hop_strategy) { case TwoHopStrategy::MATCH: match_two_hop_nodes(); break; @@ -106,7 +101,7 @@ class LPClusteringImpl final void cluster_isolated_nodes() { SCOPED_TIMER("Handle isolated nodes"); - switch (_c_ctx.lp.isolated_nodes_strategy) { + switch (_lp_ctx.isolated_nodes_strategy) { case IsolatedNodesClusteringStrategy::MATCH: match_isolated_nodes(); break; @@ -133,7 +128,7 @@ class LPClusteringImpl final } [[nodiscard]] bool 
should_handle_two_hop_nodes() const { - return (1.0 - 1.0 * _current_num_clusters / _graph->n()) <= _c_ctx.lp.two_hop_threshold; + return (1.0 - 1.0 * _current_num_clusters / _graph->n()) <= _lp_ctx.two_hop_threshold; } // @todo: old implementation that should no longer be used @@ -171,15 +166,27 @@ class LPClusteringImpl final // If this works, we set ourself as clustering partners for nodes that have the same favored // cluster we have NodeID expected_value = favored_leader; - if (_favored_clusters[favored_leader].compare_exchange_strong(expected_value, u)) { + if (__atomic_compare_exchange_n( + &_favored_clusters[favored_leader], + &expected_value, + u, + false, + __ATOMIC_SEQ_CST, + __ATOMIC_SEQ_CST + )) { break; } // If this did not work, there is another node that has the same favored cluster // Try to join the cluster of that node const NodeID partner = expected_value; - if (_favored_clusters[favored_leader].compare_exchange_strong( - expected_value, favored_leader + if (__atomic_compare_exchange_n( + &_favored_clusters[favored_leader], + &expected_value, + favored_leader, + false, + __ATOMIC_SEQ_CST, + __ATOMIC_SEQ_CST )) { if (move_cluster_weight(u, partner, cluster_weight(u), max_cluster_weight(partner))) { move_node(u, partner); @@ -216,7 +223,7 @@ class LPClusteringImpl final using Base::_current_num_clusters; using Base::_graph; - const CoarseningContext &_c_ctx; + const LabelPropagationCoarseningContext &_lp_ctx; NodeWeight _max_cluster_weight = kInvalidBlockWeight; }; @@ -224,22 +231,30 @@ class LPClusteringImpl final // Exposed wrapper // -LPClustering::LPClustering(const NodeID max_n, const CoarseningContext &c_ctx) - : _core{std::make_unique(max_n, c_ctx)} {} +LegacyLPClustering::LegacyLPClustering(const CoarseningContext &c_ctx) + : _core(std::make_unique(c_ctx)) {} // we must declare the destructor explicitly here, otherwise, it is implicitly -// generated before LabelPropagationClusterCore is complete -LPClustering::~LPClustering() = default; +// generated before LegacyLabelPropagationClusterCore is complete +LegacyLPClustering::~LegacyLPClustering() = default; -void LPClustering::set_max_cluster_weight(const NodeWeight max_cluster_weight) { +void LegacyLPClustering::set_max_cluster_weight(const NodeWeight max_cluster_weight) { _core->set_max_cluster_weight(max_cluster_weight); } -void LPClustering::set_desired_cluster_count(const NodeID count) { +void LegacyLPClustering::set_desired_cluster_count(const NodeID count) { _core->set_desired_num_clusters(count); } -const Clusterer::AtomicClusterArray &LPClustering::compute_clustering(const Graph &graph) { - return _core->compute_clustering(graph); +void LegacyLPClustering::compute_clustering( + StaticArray &clustering, const Graph &graph, bool +) { + if (auto *csr_graph = dynamic_cast(graph.underlying_graph()); + csr_graph != nullptr) { + _core->compute_clustering(clustering, *csr_graph, false); + return; + } + + __builtin_unreachable(); } } // namespace kaminpar::shm diff --git a/kaminpar-shm/coarsening/clustering/legacy_lp_clusterer.h b/kaminpar-shm/coarsening/clustering/legacy_lp_clusterer.h new file mode 100644 index 00000000..cfa45a9d --- /dev/null +++ b/kaminpar-shm/coarsening/clustering/legacy_lp_clusterer.h @@ -0,0 +1,36 @@ +/****************************************************************************** + * Label propagation for graph coarsening / clustering. 
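// Editorial sketch (not part of the patch): semantics of the __atomic_compare_exchange_n builtin
// that replaces std::atomic<>::compare_exchange_strong in the two-hop matching code above,
// presumably because the favored-cluster array now stores plain (non-std::atomic) integers. On
// failure the builtin writes the observed value back into `expected`, which is exactly what the
// matching logic uses to discover its clustering partner. Standalone toy example with made-up values.
#include <cassert>
#include <cstdint>

int main() {
  std::uint32_t slot = 42;     // stands in for _favored_clusters[favored_leader]
  std::uint32_t expected = 42; // we expect the slot to still hold its original value
  const std::uint32_t first = 7;
  const std::uint32_t second = 9;

  // Succeeds: slot == expected, so slot becomes 7 and the call returns true.
  bool ok = __atomic_compare_exchange_n(&slot, &expected, first, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
  assert(ok && slot == 7);

  // Fails: slot holds 7, not 42; `expected` is overwritten with the observed value 7.
  expected = 42;
  ok = __atomic_compare_exchange_n(&slot, &expected, second, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
  assert(!ok && expected == 7 && slot == 7);

  return 0;
}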
+ * + * @file: legacy_lp_clusterer.h + * @author: Daniel Seemaier + * @date: 29.09.2021 + ******************************************************************************/ +#pragma once + +#include "kaminpar-shm/coarsening/clusterer.h" +#include "kaminpar-shm/datastructures/graph.h" + +namespace kaminpar::shm { +class LegacyLPClustering : public Clusterer { +public: + LegacyLPClustering(const CoarseningContext &c_ctx); + + LegacyLPClustering(const LegacyLPClustering &) = delete; + LegacyLPClustering &operator=(const LegacyLPClustering &) = delete; + + LegacyLPClustering(LegacyLPClustering &&) noexcept = default; + LegacyLPClustering &operator=(LegacyLPClustering &&) noexcept = default; + + ~LegacyLPClustering() override; + + void set_max_cluster_weight(NodeWeight max_cluster_weight) final; + void set_desired_cluster_count(NodeID count) final; + + void compute_clustering( + StaticArray &clustering, const Graph &graph, bool free_memory_afterwards + ) final; + +private: + std::unique_ptr _core; +}; +} // namespace kaminpar::shm diff --git a/kaminpar-shm/coarsening/clustering/lp_clusterer.cc b/kaminpar-shm/coarsening/clustering/lp_clusterer.cc new file mode 100644 index 00000000..bdbf8095 --- /dev/null +++ b/kaminpar-shm/coarsening/clustering/lp_clusterer.cc @@ -0,0 +1,365 @@ +/****************************************************************************** + * Label propagation for graph coarsening / clustering. + * + * @file: lp_clusterer.cc + * @author: Daniel Seemaier + * @date: 29.09.2021 + ******************************************************************************/ +#include "kaminpar-shm/coarsening/clustering/lp_clusterer.h" + +#include "kaminpar-shm/label_propagation.h" + +#include "kaminpar-common/heap_profiler.h" +#include "kaminpar-common/timer.h" + +namespace kaminpar::shm { + +// +// Actual implementation -- not exposed in header +// + +struct LPClusteringConfig : public LabelPropagationConfig { + using ClusterID = NodeID; + using ClusterWeight = BlockWeight; + static constexpr bool kTrackClusterCount = true; + static constexpr bool kUseTwoHopClustering = true; +}; + +template +class LPClusteringImpl final + : public ChunkRandomLabelPropagation, LPClusteringConfig, Graph>, + public OwnedRelaxedClusterWeightVector, + public NonatomicClusterVectorRef { + SET_DEBUG(false); + + using Base = ChunkRandomLabelPropagation; + using ClusterWeightBase = OwnedRelaxedClusterWeightVector; + using ClusterBase = NonatomicClusterVectorRef; + +public: + using Permutations = Base::Permutations; + + LPClusteringImpl(const CoarseningContext &c_ctx, Permutations &permutations) + : Base(permutations), + ClusterWeightBase(c_ctx.clustering.lp.use_two_level_cluster_weight_vector), + _lp_ctx(c_ctx.clustering.lp) { + Base::set_max_degree(_lp_ctx.large_degree_threshold); + Base::set_max_num_neighbors(_lp_ctx.max_num_neighbors); + Base::set_use_two_phases(_lp_ctx.use_two_phases); + Base::set_second_phase_select_mode(_lp_ctx.second_phase_select_mode); + Base::set_second_phase_aggregation_mode(_lp_ctx.second_phase_aggregation_mode); + Base::set_relabel_before_second_phase(_lp_ctx.relabel_before_second_phase); + } + + void set_max_cluster_weight(const NodeWeight max_cluster_weight) { + _max_cluster_weight = max_cluster_weight; + } + + void preinitialize(const NodeID num_nodes) { + Base::preinitialize(num_nodes, num_nodes); + } + + void allocate(const NodeID num_clusters) { + SCOPED_HEAP_PROFILER("Allocation"); + SCOPED_TIMER("Allocation"); + + Base::allocate(); + 
ClusterWeightBase::allocate_cluster_weights(num_clusters); + } + + void free() { + SCOPED_HEAP_PROFILER("Free"); + SCOPED_TIMER("Free"); + + Base::free(); + ClusterWeightBase::free(); + } + + void compute_clustering(StaticArray &clustering, const Graph &graph) { + ClusterWeightBase::reset_cluster_weights(); + ClusterBase::init_clusters_ref(clustering); + Base::initialize(&graph, graph.n()); + + for (std::size_t iteration = 0; iteration < _lp_ctx.num_iterations; ++iteration) { + SCOPED_TIMER("Iteration", std::to_string(iteration)); + if (Base::perform_iteration() == 0) { + break; + } + + // Only relabel during the first iteration because afterwards the memory for the second phase + // is already allocated. + if (iteration == 0) { + Base::set_relabel_before_second_phase(false); + } + } + + cluster_isolated_nodes(); + cluster_two_hop_nodes(); + } + +private: + void cluster_two_hop_nodes() { + SCOPED_HEAP_PROFILER("Handle two-hop nodes"); + SCOPED_TIMER("Handle two-hop nodes"); + + if (!should_handle_two_hop_nodes()) { + return; + } + + switch (_lp_ctx.two_hop_strategy) { + case TwoHopStrategy::MATCH: + Base::match_two_hop_nodes(); + break; + case TwoHopStrategy::MATCH_THREADWISE: + Base::match_two_hop_nodes_threadwise(); + break; + case TwoHopStrategy::CLUSTER: + Base::cluster_two_hop_nodes(); + break; + case TwoHopStrategy::CLUSTER_THREADWISE: + Base::cluster_two_hop_nodes_threadwise(); + break; + case TwoHopStrategy::LEGACY: + handle_two_hop_clustering_legacy(); + break; + case TwoHopStrategy::DISABLE: + break; + } + } + + void cluster_isolated_nodes() { + SCOPED_HEAP_PROFILER("Handle isolated nodes"); + SCOPED_TIMER("Handle isolated nodes"); + + switch (_lp_ctx.isolated_nodes_strategy) { + case IsolatedNodesClusteringStrategy::MATCH: + Base::match_isolated_nodes(); + break; + case IsolatedNodesClusteringStrategy::CLUSTER: + Base::cluster_isolated_nodes(); + break; + case IsolatedNodesClusteringStrategy::MATCH_DURING_TWO_HOP: + if (should_handle_two_hop_nodes()) { + Base::match_isolated_nodes(); + } + break; + case IsolatedNodesClusteringStrategy::CLUSTER_DURING_TWO_HOP: + if (should_handle_two_hop_nodes()) { + Base::cluster_isolated_nodes(); + } + break; + case IsolatedNodesClusteringStrategy::KEEP: + break; + } + } + + [[nodiscard]] bool should_handle_two_hop_nodes() const { + return (1.0 - 1.0 * _current_num_clusters / _graph->n()) <= _lp_ctx.two_hop_threshold; + } + + // @todo: old implementation that should no longer be used + void handle_two_hop_clustering_legacy() { + // Reset _favored_clusters entries for nodes that are not considered for + // 2-hop clustering, i.e., nodes that are already clustered with at least one other node or + // nodes that have more weight than max_weight/2. 
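// Editorial sketch (not part of the patch): the relative shrink factor used twice above, once in
// ClusteringCoarsener::coarsen() against the convergence threshold and once in
// should_handle_two_hop_nodes() against the two-hop threshold. The node counts and the 0.05
// threshold are made-up illustration values.
#include <cstdint>
#include <iostream>

int main() {
  const std::uint64_t prev_n = 1'000'000; // nodes before contraction (or n() before clustering)
  const std::uint64_t next_n = 980'000;   // nodes after contraction (or resulting cluster count)

  const double shrink = 1.0 - 1.0 * next_n / prev_n; // = 0.02, i.e., the graph shrank by 2%
  const double threshold = 0.05;                     // assumed threshold value

  // A 2% reduction is below the assumed 5% threshold: coarsen() would report convergence, and the
  // LP clusterer would fall back to its two-hop handling.
  std::cout << "shrink factor = " << shrink << (shrink <= threshold ? " (below threshold)\n" : " (above threshold)\n");
  return 0;
}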
+ // Set _favored_clusters to dummy entry _graph->n() for isolated nodes + tbb::parallel_for(0, _graph->n(), [&](const NodeID u) { + if (u != cluster(u)) { + Base::_favored_clusters[u] = u; + } else { + const auto initial_weight = initial_cluster_weight(u); + const auto current_weight = ClusterWeightBase::cluster_weight(u); + const auto max_weight = max_cluster_weight(u); + if (current_weight != initial_weight || current_weight > max_weight / 2) { + Base::_favored_clusters[u] = u; + } + } + }); + + tbb::parallel_for(0, _graph->n(), [&](const NodeID u) { + // Abort once we have merged enough clusters to achieve the configured minimum shrink factor + if (Base::should_stop()) { + return; + } + + // Skip nodes that should not be considered during 2-hop clustering + const NodeID favored_leader = Base::_favored_clusters[u]; + if (favored_leader == u) { + return; + } + + do { + // If this works, we set ourself as clustering partners for nodes that have the same favored + // cluster we have + NodeID expected_value = favored_leader; + if (__atomic_compare_exchange_n( + &Base::_favored_clusters[favored_leader], + &expected_value, + u, + false, + __ATOMIC_SEQ_CST, + __ATOMIC_SEQ_CST + )) { + break; + } + + // If this did not work, there is another node that has the same favored cluster + // Try to join the cluster of that node + const NodeID partner = expected_value; + if (__atomic_compare_exchange_n( + &Base::_favored_clusters[favored_leader], + &expected_value, + favored_leader, + false, + __ATOMIC_SEQ_CST, + __ATOMIC_SEQ_CST + )) { + if (ClusterWeightBase::move_cluster_weight( + u, partner, ClusterWeightBase::cluster_weight(u), max_cluster_weight(partner) + )) { + move_node(u, partner); + --_current_num_clusters; + } + + break; + } + } while (true); + }); + } + +public: + [[nodiscard]] NodeID initial_cluster(const NodeID u) { + return u; + } + + [[nodiscard]] NodeWeight initial_cluster_weight(const NodeID cluster) { + return _graph->node_weight(cluster); + } + + [[nodiscard]] NodeWeight max_cluster_weight(const NodeID /* cluster */) { + return _max_cluster_weight; + } + + [[nodiscard]] bool accept_cluster(const Base::ClusterSelectionState &state) { + return (state.current_gain > state.best_gain || + (state.current_gain == state.best_gain && state.local_rand.random_bool())) && + (state.current_cluster_weight + state.u_weight <= + max_cluster_weight(state.current_cluster) || + state.current_cluster == state.initial_cluster); + } + + using Base::_current_num_clusters; + using Base::_graph; + + const LabelPropagationCoarseningContext &_lp_ctx; + NodeWeight _max_cluster_weight = kInvalidBlockWeight; +}; + +class LPClusteringImplWrapper { +public: + LPClusteringImplWrapper(const CoarseningContext &c_ctx) + : _csr_core(std::make_unique>(c_ctx, _permutations)), + _compact_csr_core(std::make_unique>(c_ctx, _permutations) + ), + _compressed_core(std::make_unique>(c_ctx, _permutations) + ) {} + + void set_max_cluster_weight(const NodeWeight max_cluster_weight) { + _csr_core->set_max_cluster_weight(max_cluster_weight); + _compact_csr_core->set_max_cluster_weight(max_cluster_weight); + _compressed_core->set_max_cluster_weight(max_cluster_weight); + } + + void set_desired_cluster_count(const NodeID count) { + _csr_core->set_desired_num_clusters(count); + _compact_csr_core->set_desired_num_clusters(count); + _compressed_core->set_desired_num_clusters(count); + } + + void compute_clustering( + StaticArray &clustering, const Graph &graph, const bool free_memory_afterwards + ) { + // Compute a clustering and 
setup/release the data structures used by the core, so that they can + // be shared by all implementations. + const auto compute = [&](auto &core, auto &graph) { + if (_freed) { + _freed = false; + core.allocate(graph.n()); + } else { + core.setup(std::move(_structs)); + core.setup_cluster_weights(std::move(_cluster_weights)); + } + + core.compute_clustering(clustering, graph); + + if (free_memory_afterwards) { + _freed = true; + core.free(); + } else { + _structs = core.release(); + _cluster_weights = core.take_cluster_weights(); + } + }; + + const NodeID num_nodes = graph.n(); + _csr_core->preinitialize(num_nodes); + _compact_csr_core->preinitialize(num_nodes); + _compressed_core->preinitialize(num_nodes); + + if (auto *csr_graph = dynamic_cast(graph.underlying_graph()); + csr_graph != nullptr) { + compute(*_csr_core, *csr_graph); + } else if (auto *compact_csr_graph = + dynamic_cast(graph.underlying_graph()); + compact_csr_graph != nullptr) { + compute(*_compact_csr_core, *compact_csr_graph); + } else if (auto *compressed_graph = + dynamic_cast(graph.underlying_graph()); + compressed_graph != nullptr) { + compute(*_compressed_core, *compressed_graph); + } + + // Only relabel clusters during the first iteration + _csr_core->set_relabel_before_second_phase(false); + _compact_csr_core->set_relabel_before_second_phase(false); + _compressed_core->set_relabel_before_second_phase(false); + } + +private: + std::unique_ptr> _csr_core; + std::unique_ptr> _compact_csr_core; + std::unique_ptr> _compressed_core; + + // The data structures that are used by the LP clusterer and are shared between the + // different implementations. + bool _freed = true; + LPClusteringImpl::Permutations _permutations; + LPClusteringImpl::DataStructures _structs; + LPClusteringImpl::ClusterWeights _cluster_weights; +}; + +// +// Exposed wrapper +// + +LPClustering::LPClustering(const CoarseningContext &c_ctx) + : _impl_wrapper(std::make_unique(c_ctx)) {} + +// we must declare the destructor explicitly here, otherwise, it is implicitly +// generated before LPClusteringImplWrapper is complete +LPClustering::~LPClustering() = default; + +void LPClustering::set_max_cluster_weight(const NodeWeight max_cluster_weight) { + _impl_wrapper->set_max_cluster_weight(max_cluster_weight); +} + +void LPClustering::set_desired_cluster_count(const NodeID count) { + _impl_wrapper->set_desired_cluster_count(count); +} + +void LPClustering::compute_clustering( + StaticArray &clustering, const Graph &graph, const bool free_memory_afterwards +) { + return _impl_wrapper->compute_clustering(clustering, graph, free_memory_afterwards); +} +} // namespace kaminpar::shm diff --git a/kaminpar-shm/coarsening/lp_clustering.h b/kaminpar-shm/coarsening/clustering/lp_clusterer.h similarity index 75% rename from kaminpar-shm/coarsening/lp_clustering.h rename to kaminpar-shm/coarsening/clustering/lp_clusterer.h index 8e2b87bd..7e81bf06 100644 --- a/kaminpar-shm/coarsening/lp_clustering.h +++ b/kaminpar-shm/coarsening/clustering/lp_clusterer.h @@ -1,20 +1,22 @@ /****************************************************************************** * Label propagation for graph coarsening / clustering. 
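// Editorial sketch (not part of the patch): the dispatch-by-representation pattern used by
// LPClusteringImplWrapper::compute_clustering() above. The template arguments of the dynamic_casts
// are lost in this rendering of the patch; the pointer types below (const CSRGraph *,
// const CompactCSRGraph *, const CompressedGraph *) and the single include are assumptions based
// on the surrounding LPClusteringImpl instantiations, and dispatch_by_representation() is a
// hypothetical helper.
#include "kaminpar-shm/datastructures/graph.h"

namespace kaminpar::shm {
template <typename OnCSR, typename OnCompactCSR, typename OnCompressed>
void dispatch_by_representation(
    const Graph &graph, OnCSR &&on_csr, OnCompactCSR &&on_compact_csr, OnCompressed &&on_compressed
) {
  if (const auto *csr = dynamic_cast<const CSRGraph *>(graph.underlying_graph()); csr != nullptr) {
    on_csr(*csr); // e.g., run the core specialized for plain CSR graphs
  } else if (const auto *compact = dynamic_cast<const CompactCSRGraph *>(graph.underlying_graph());
             compact != nullptr) {
    on_compact_csr(*compact);
  } else if (const auto *compressed = dynamic_cast<const CompressedGraph *>(graph.underlying_graph());
             compressed != nullptr) {
    on_compressed(*compressed);
  }
}
} // namespace kaminpar::shm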
* - * @file: lp_clustering.h + * @file: lp_clusterer.h * @author: Daniel Seemaier * @date: 29.09.2021 ******************************************************************************/ #pragma once +#include + #include "kaminpar-shm/coarsening/clusterer.h" -#include "kaminpar-shm/context.h" #include "kaminpar-shm/datastructures/graph.h" namespace kaminpar::shm { + class LPClustering : public Clusterer { public: - LPClustering(NodeID max_n, const CoarseningContext &c_ctx); + LPClustering(const CoarseningContext &c_ctx); LPClustering(const LPClustering &) = delete; LPClustering &operator=(const LPClustering &) = delete; @@ -27,10 +29,12 @@ class LPClustering : public Clusterer { void set_max_cluster_weight(NodeWeight max_cluster_weight) final; void set_desired_cluster_count(NodeID count) final; - const AtomicClusterArray &compute_clustering(const Graph &graph) final; + void compute_clustering( + StaticArray &clustering, const Graph &graph, bool free_memory_afterwards + ) final; private: - std::unique_ptr _core; + std::unique_ptr _impl_wrapper; }; } // namespace kaminpar::shm diff --git a/kaminpar-shm/coarsening/clustering/noop_clusterer.h b/kaminpar-shm/coarsening/clustering/noop_clusterer.h new file mode 100644 index 00000000..f6fdcd5d --- /dev/null +++ b/kaminpar-shm/coarsening/clustering/noop_clusterer.h @@ -0,0 +1,45 @@ +/******************************************************************************* + * A dummy clusterer that assigns each node to its own singleton cluster. + * + * @file: noop_clusterer.h + * @author: Daniel Seemaier + * @date: 16.06.2024 + ******************************************************************************/ +#pragma once + +#include "kaminpar-shm/coarsening/clusterer.h" +#include "kaminpar-shm/datastructures/graph.h" +#include "kaminpar-shm/kaminpar.h" + +#include "kaminpar-common/datastructures/static_array.h" + +namespace kaminpar::shm { +class NoopClusterer : public Clusterer { +public: + NoopClusterer() = default; + + NoopClusterer(const NoopClusterer &) = delete; + NoopClusterer &operator=(const NoopClusterer &) = delete; + + NoopClusterer(NoopClusterer &&) noexcept = default; + NoopClusterer &operator=(NoopClusterer &&) noexcept = default; + + // + // Optional options + // + + virtual void set_max_cluster_weight(const NodeWeight /* weight */) {} + virtual void set_desired_cluster_count(const NodeID /* count */) {} + + // + // Clustering function + // + + virtual void compute_clustering( + StaticArray &clustering, const Graph &graph, bool free_memory_afterwards + ) { + tbb::parallel_for(0, graph.n(), [&](const NodeID i) { clustering[i] = i; }); + } +}; +} // namespace kaminpar::shm + diff --git a/kaminpar-shm/coarsening/coarsener.h b/kaminpar-shm/coarsening/coarsener.h index c075053d..e3e608e1 100644 --- a/kaminpar-shm/coarsening/coarsener.h +++ b/kaminpar-shm/coarsening/coarsener.h @@ -1,5 +1,5 @@ /******************************************************************************* - * Interface for graph coarseners. + * Interface for the coarsening phase of multilevel graph partitioning. * * @file: coarsener.h * @author: Daniel Seemaier @@ -13,11 +13,7 @@ namespace kaminpar::shm { /** - * Clustering graphutils. - * - * Call #coarsen() repeatedly to produce a hierarchy of coarse graph. The coarse - * graphs are owned by the clustering graphutils. To unroll the graph hierarchy, - * call #uncoarsen() with a partition of the currently coarsest graph. + * Interface for the coarsening phase of multilevel graph partitioning. 
*/ class Coarsener { public: @@ -32,25 +28,32 @@ class Coarsener { virtual ~Coarsener() = default; /** - * Coarsen the currently coarsest graph with a static maximum node weight. + * Initializes the coarsener with a new toplevel graph. + */ + virtual void initialize(const Graph *graph) = 0; + + /** + * Computes the next level of the graph hierarchy. * - * @param max_cluster_weight Maximum node weight of the coarse graph. - * @param to_size Desired size of the coarse graph. - * @return New coarsest graph and whether coarsening has not converged. + * @return whether coarsening has *not* yet converged. */ - virtual std::pair - compute_coarse_graph(NodeWeight max_cluster_weight, NodeID to_size) = 0; + virtual bool coarsen() = 0; - /** @return The currently coarsest graph, or the input graph, if no coarse - * graphs have been computed so far. */ - [[nodiscard]] virtual const Graph *coarsest_graph() const = 0; + /** + * @return the coarsest graph in the hierarchy. + */ + [[nodiscard]] virtual const Graph &current() const = 0; - /** @return Number of coarsest graphs that have already been computed. */ - [[nodiscard]] virtual std::size_t size() const = 0; + /** + * @return number of coarse graphs in the hierarchy. + */ + [[nodiscard]] virtual std::size_t level() const = 0; - /** @return Whether we have not computed any coarse graphs so far. */ + /** + * @return whether we have *not* yet computed any coarse graphs. + */ [[nodiscard]] bool empty() const { - return size() == 0; + return level() == 0; } /** @@ -58,13 +61,11 @@ class Coarsener { * graph and frees the currently coarsest graph, i.e., unrolls one level of * the coarse graph hierarchy. * - * @param p_graph Partition of the currently coarsest graph, i.e., - * `p_graph.graph() == *coarsest_graph()`. - * @return Partition of the new coarsest graph. + * @param p_graph Partition of the currently coarsest graph. + * Precondition: `p_graph.graph() == current()`. + * + * @return partition of the *new* coarsest graph. */ virtual PartitionedGraph uncoarsen(PartitionedGraph &&p_graph) = 0; - - //! Re-initialize this coarsener object with a new graph. - virtual void initialize(const Graph *graph) = 0; }; } // namespace kaminpar::shm diff --git a/kaminpar-shm/coarsening/contraction/buffered_cluster_contraction.cc b/kaminpar-shm/coarsening/contraction/buffered_cluster_contraction.cc new file mode 100644 index 00000000..41f6f5b3 --- /dev/null +++ b/kaminpar-shm/coarsening/contraction/buffered_cluster_contraction.cc @@ -0,0 +1,298 @@ +/******************************************************************************* + * Contraction implementation that uses an edge buffer to store edges before + * building the final graph. + * + * @file: buffered_cluster_contraction.cc + * @author: Daniel Seemaier + * @author: Daniel Salwasser + * @date: 21.09.2021 + ******************************************************************************/ +#include "kaminpar-shm/coarsening/contraction/buffered_cluster_contraction.h" + +#include + +#include "kaminpar-shm/coarsening/contraction/cluster_contraction.h" +#include "kaminpar-shm/coarsening/contraction/cluster_contraction_preprocessing.h" + +#include "kaminpar-common/datastructures/compact_static_array.h" +#include "kaminpar-common/datastructures/rating_map.h" +#include "kaminpar-common/datastructures/static_array.h" +#include "kaminpar-common/heap_profiler.h" +#include "kaminpar-common/timer.h" + +namespace kaminpar::shm::contraction { +namespace { +template
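// Editorial sketch (not part of the patch): how the reworked Coarsener interface above is meant to
// be driven. coarsen_down_to() is a hypothetical helper and the stopping condition on the coarse
// graph size is an illustrative assumption; the initialize()/coarsen()/current()/level() contract
// follows the interface comments in the diff.
#include "kaminpar-shm/coarsening/coarsener.h"

namespace kaminpar::shm {
const Graph &coarsen_down_to(Coarsener &coarsener, const Graph &input_graph, const NodeID target_n) {
  coarsener.initialize(&input_graph);

  // coarsen() computes one more hierarchy level and returns false once coarsening has converged.
  while (coarsener.current().n() > target_n && coarsener.coarsen()) {
    // coarsener.level() counts the coarse graphs computed so far; current() is the coarsest one.
  }

  return coarsener.current();
}
} // namespace kaminpar::shm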