From 14feffdae8aaa1b4b1cb840aa11c1b20d361e6a2 Mon Sep 17 00:00:00 2001 From: Nikolai Maas Date: Mon, 7 Oct 2024 18:45:46 +0200 Subject: [PATCH] integrate community detection --- mt-kahypar/datastructures/CMakeLists.txt | 3 +- mt-kahypar/datastructures/buffered_vector.h | 1 + mt-kahypar/partition/partitioner.cpp | 2 +- .../partition/preprocessing/CMakeLists.txt | 6 +- .../community_detection/parallel_louvain.cpp | 51 ++++++++----- .../community_detection/parallel_louvain.h | 4 +- tools/ml_graph_stats.cc | 75 +++++++++++++++---- tools/neighborhood_computation.h | 8 +- 8 files changed, 111 insertions(+), 39 deletions(-) diff --git a/mt-kahypar/datastructures/CMakeLists.txt b/mt-kahypar/datastructures/CMakeLists.txt index c9d6b1c2f..895fef228 100644 --- a/mt-kahypar/datastructures/CMakeLists.txt +++ b/mt-kahypar/datastructures/CMakeLists.txt @@ -47,7 +47,8 @@ set(ToolsDatastructureSources dynamic_graph.cpp dynamic_graph_factory.cpp dynamic_adjacency_array.cpp - fixed_vertex_support.cpp) + fixed_vertex_support.cpp + graph.cpp) foreach(modtarget IN LISTS TOOLS_TARGETS) target_sources(${modtarget} PRIVATE ${ToolsDatastructureSources}) diff --git a/mt-kahypar/datastructures/buffered_vector.h b/mt-kahypar/datastructures/buffered_vector.h index 86030a68b..4937c9440 100644 --- a/mt-kahypar/datastructures/buffered_vector.h +++ b/mt-kahypar/datastructures/buffered_vector.h @@ -28,6 +28,7 @@ #pragma once #include +#include #include #include diff --git a/mt-kahypar/partition/partitioner.cpp b/mt-kahypar/partition/partitioner.cpp index b0bba00ae..4d00c65f6 100644 --- a/mt-kahypar/partition/partitioner.cpp +++ b/mt-kahypar/partition/partitioner.cpp @@ -274,7 +274,7 @@ namespace mt_kahypar { } timer.stop_timer("construct_graph"); timer.start_timer("perform_community_detection", "Perform Community Detection"); - ds::Clustering communities = community_detection::run_parallel_louvain(graph, context); + ds::Clustering communities = community_detection::run_parallel_louvain(graph, context).back().first; graph.restrictClusteringToHypernodes(hypergraph, communities); hypergraph.setCommunityIDs(std::move(communities)); timer.stop_timer("perform_community_detection"); diff --git a/mt-kahypar/partition/preprocessing/CMakeLists.txt b/mt-kahypar/partition/preprocessing/CMakeLists.txt index 55dab7a5b..ec6ba9beb 100644 --- a/mt-kahypar/partition/preprocessing/CMakeLists.txt +++ b/mt-kahypar/partition/preprocessing/CMakeLists.txt @@ -4,4 +4,8 @@ set(PreprocessingSources foreach(modtarget IN LISTS PARTITIONING_SUITE_TARGETS) target_sources(${modtarget} PRIVATE ${PreprocessingSources}) -endforeach() \ No newline at end of file +endforeach() + +foreach(modtarget IN LISTS TOOLS_TARGETS) + target_sources(${modtarget} PRIVATE ${PreprocessingSources}) +endforeach() diff --git a/mt-kahypar/partition/preprocessing/community_detection/parallel_louvain.cpp b/mt-kahypar/partition/preprocessing/community_detection/parallel_louvain.cpp index cafd63cdb..486b75a50 100644 --- a/mt-kahypar/partition/preprocessing/community_detection/parallel_louvain.cpp +++ b/mt-kahypar/partition/preprocessing/community_detection/parallel_louvain.cpp @@ -34,49 +34,66 @@ namespace mt_kahypar::community_detection { template - ds::Clustering local_moving_contract_recurse(Graph& fine_graph, - ParallelLocalMovingModularity& mlv, - const Context& context) { + std::vector> local_moving_contract_recurse(Graph& fine_graph, + ParallelLocalMovingModularity& mlv, + const Context& context) { utils::Timer& timer = utils::Utilities::instance().getTimer(context.utility_id); timer.start_timer("local_moving", "Local Moving"); - ds::Clustering communities(fine_graph.numNodes()); - bool communities_changed = mlv.localMoving(fine_graph, communities); + ds::Clustering own_communities(fine_graph.numNodes()); + bool communities_changed = mlv.localMoving(fine_graph, own_communities); timer.stop_timer("local_moving"); + std::vector> result; if (communities_changed) { timer.start_timer("contraction_cd", "Contraction"); // Contract Communities - Graph coarse_graph = fine_graph.contract(communities, context.preprocessing.community_detection.low_memory_contraction); + Graph coarse_graph = fine_graph.contract(own_communities, context.preprocessing.community_detection.low_memory_contraction); ASSERT(coarse_graph.totalVolume() == fine_graph.totalVolume()); timer.stop_timer("contraction_cd"); + double new_modularity = 0; + double factor = 1 / coarse_graph.totalVolume(); + for (NodeID node: coarse_graph.nodes()) { + double contribution = coarse_graph.nodeVolume(node); + for (const Arc& arc : coarse_graph.arcsOf(node)) { + contribution -= arc.weight; // only count internal edges + } + contribution -= factor * coarse_graph.nodeVolume(node) * coarse_graph.nodeVolume(node); + new_modularity += factor * contribution; + } + result.emplace_back(own_communities, new_modularity); + // Recurse on contracted graph - ds::Clustering coarse_communities = local_moving_contract_recurse(coarse_graph, mlv, context); + auto coarse_communities = local_moving_contract_recurse(coarse_graph, mlv, context); timer.start_timer("project", "Project"); // Prolong Clustering - tbb::parallel_for(UL(0), fine_graph.numNodes(), [&](const NodeID u) { - ASSERT(communities[u] < static_cast(coarse_communities.size())); - communities[u] = coarse_communities[communities[u]]; - }); + for (const auto& [comm, modularity]: coarse_communities) { + ds::Clustering communities(own_communities); // yes, this is an intentional copy + tbb::parallel_for(UL(0), fine_graph.numNodes(), [&](const NodeID u) { + ASSERT(communities[u] < static_cast(comm.size())); + communities[u] = comm[communities[u]]; + }); + result.emplace_back(std::move(communities), modularity); + } timer.stop_timer("project"); } - return communities; + return result; } template - ds::Clustering run_parallel_louvain(Graph& graph, + std::vector> run_parallel_louvain(Graph& graph, const Context& context, bool disable_randomization) { ParallelLocalMovingModularity mlv(context, graph.numNodes(), disable_randomization); - ds::Clustering communities = local_moving_contract_recurse(graph, mlv, context); - return communities; + auto result = local_moving_contract_recurse(graph, mlv, context); + return result; } namespace { - #define LOCAL_MOVING(X) ds::Clustering local_moving_contract_recurse(Graph&, ParallelLocalMovingModularity&, const Context&) - #define PARALLEL_LOUVAIN(X) ds::Clustering run_parallel_louvain(Graph&, const Context&, bool) + #define LOCAL_MOVING(X) std::vector> local_moving_contract_recurse(Graph&, ParallelLocalMovingModularity&, const Context&) + #define PARALLEL_LOUVAIN(X) std::vector> run_parallel_louvain(Graph&, const Context&, bool) } INSTANTIATE_FUNC_WITH_HYPERGRAPHS(LOCAL_MOVING) diff --git a/mt-kahypar/partition/preprocessing/community_detection/parallel_louvain.h b/mt-kahypar/partition/preprocessing/community_detection/parallel_louvain.h index 783f91810..3ada2fdb7 100644 --- a/mt-kahypar/partition/preprocessing/community_detection/parallel_louvain.h +++ b/mt-kahypar/partition/preprocessing/community_detection/parallel_louvain.h @@ -32,11 +32,11 @@ namespace mt_kahypar::community_detection { template - ds::Clustering local_moving_contract_recurse(Graph& fine_graph, + std::vector> local_moving_contract_recurse(Graph& fine_graph, ParallelLocalMovingModularity& mlv, const Context& context); template - ds::Clustering run_parallel_louvain(Graph& graph, + std::vector> run_parallel_louvain(Graph& graph, const Context& context, bool disable_randomization = false); } diff --git a/tools/ml_graph_stats.cc b/tools/ml_graph_stats.cc index 995265fc5..b2b68a67b 100644 --- a/tools/ml_graph_stats.cc +++ b/tools/ml_graph_stats.cc @@ -48,6 +48,7 @@ #include "mt-kahypar/partition/context.h" #include "mt-kahypar/io/hypergraph_factory.h" #include "mt-kahypar/io/hypergraph_io.h" +#include "mt-kahypar/partition/preprocessing/community_detection/parallel_louvain.h" #include "mt-kahypar/utils/cast.h" #include "mt-kahypar/utils/delete.h" #include "mt-kahypar/utils/hypergraph_statistics.h" @@ -59,8 +60,8 @@ using namespace mt_kahypar; namespace po = boost::program_options; -using Graph = ds::StaticGraph; - +using StaticGraph = ds::StaticGraph; +using LouvainGraph = ds::Graph; enum class FeatureType { floatingpoint, @@ -468,7 +469,8 @@ bool float_eq(double left, double right) { } -std::pair> computeGlobalFeatures(const Graph& graph) { +std::pair> computeGlobalFeatures(const StaticGraph& graph, + std::vector>& community_stack) { GlobalFeatures features; std::vector hn_degrees; @@ -479,7 +481,7 @@ std::pair> computeGlobalFeatures(const Gra }); HypernodeID num_nodes = graph.initialNumNodes(); - HyperedgeID num_edges = Graph::is_graph ? graph.initialNumEdges() / 2 : graph.initialNumEdges(); + HyperedgeID num_edges = StaticGraph::is_graph ? graph.initialNumEdges() / 2 : graph.initialNumEdges(); Statistic degree_stats = createStats(hn_degrees, true); features.n = num_nodes; features.m = num_edges; @@ -497,11 +499,33 @@ std::pair> computeGlobalFeatures(const Gra } } - // TODO: modularity + // modularity features + ds::DynamicSparseMap comm_set; + auto modularity_features = [&](size_t i) { + const auto& [clustering, modularity] = community_stack.at(community_stack.size() - i - 1); + comm_set.clear(); + for (PartitionID c: clustering) { + comm_set[c] = 0; + } + uint64_t n_comms = 0; + for (auto _: comm_set) { + n_comms++; + } + return std::make_pair(n_comms, modularity); + }; + std::tie(features.n_communities_0, features.modularity_0) = modularity_features(0); + std::tie(features.n_communities_1, features.modularity_1) = modularity_features(1); + std::tie(features.n_communities_2, features.modularity_2) = modularity_features(2); + if (community_stack.size() > 3 && features.n_communities_1 < 2 * features.n_communities_0) { + // small hack to get more meaningful features + std::tie(features.n_communities_1, features.modularity_1) = modularity_features(2); + std::tie(features.n_communities_2, features.modularity_2) = modularity_features(3); + } + return {features, hn_degrees}; } -N1Features n1FeaturesFromNeighborhood(const Graph& graph, const std::vector& global_degrees, const NeighborhoodResult& data, CliqueComputation* c_comp) { +N1Features n1FeaturesFromNeighborhood(const StaticGraph& graph, const std::vector& global_degrees, const NeighborhoodResult& data, CliqueComputation* c_comp) { N1Features result; HypernodeID num_nodes = data.n1_list.size(); result.degree = num_nodes; @@ -562,7 +586,7 @@ N1Features n1FeaturesFromNeighborhood(const Graph& graph, const std::vector> computeNodeFeatures(const Graph& graph, const std::vector& global_degrees) { +std::vector> computeNodeFeatures(const StaticGraph& graph, const std::vector& global_degrees) { std::vector> result; result.resize(graph.initialNumNodes()); @@ -622,7 +646,8 @@ std::vector> computeNodeFeatures return result; } -std::vector> computeEdgeFeatures(const Graph& graph, const std::vector& global_degrees) { +std::vector> computeEdgeFeatures(const StaticGraph& graph, const std::vector& global_degrees, + const std::vector>& community_stack) { tbb::enumerable_thread_specific>> result_list; tbb::enumerable_thread_specific base_neighborhood(graph.initialNumNodes()); tbb::enumerable_thread_specific result_neighborhood(graph.initialNumNodes()); @@ -670,6 +695,16 @@ std::vector> computeEdgeFeatu HypernodeID dice_divisor = result.intersect_features.degree + result.intersect_features.to_n1_edges + result.intersect_features.to_n2_edges; result.dice_similarity = intersect_size / static_cast(dice_divisor); } + + // community detection + auto equal_communities = [&](size_t i) { + const auto& [clustering, _] = community_stack.at(community_stack.size() - i - 1); + return clustering[u] == clustering[v]; + }; + result.comm_0_equal = equal_communities(0); + result.comm_1_equal = equal_communities(1); + result.comm_2_equal = equal_communities(2); + result_list.local().emplace_back(u, v, result); } }); @@ -719,7 +754,15 @@ int main(int argc, char* argv[]) { })->default_value("metis"), "Input file format: \n" " - hmetis : hMETIS hypergraph file format \n" - " - metis : METIS graph file format"); + " - metis : METIS graph file format") + ("p-louvain-min-vertex-move-fraction", + po::value(&context.preprocessing.community_detection.min_vertex_move_fraction)->value_name( + "")->default_value(0.01), + "Louvain pass terminates if less than that fraction of nodes moves during a pass") + ("p-max-louvain-pass-iterations", + po::value(&context.preprocessing.community_detection.max_pass_iterations)->value_name( + "")->default_value(5), + "Maximum number of iterations over all nodes of one louvain pass"); po::variables_map cmd_vm; po::store(po::parse_command_line(argc, argv, options), cmd_vm); @@ -741,17 +784,23 @@ int main(int argc, char* argv[]) { mt_kahypar::io::readInputFile( context.partition.graph_filename, PresetType::default_preset, InstanceType::graph, context.partition.file_format, true); - Graph& graph = utils::cast(hypergraph); + StaticGraph& graph = utils::cast(hypergraph); double time = std::chrono::duration(std::chrono::high_resolution_clock::now() - start).count(); std::cout << "Starting global feature computation [" << time << "s]" << std::endl; - auto [global_features, degrees] = computeGlobalFeatures(graph); // does not contain locality + LouvainGraph louvain_graph(graph, LouvainEdgeWeight::uniform, StaticGraph::is_graph); + auto community_stack = community_detection::run_parallel_louvain(louvain_graph, context); + ALWAYS_ASSERT(community_stack.size() > 0); + while (community_stack.size() < 3) { + community_stack.insert(community_stack.begin(), community_stack.front()); + } + auto [global_features, degrees] = computeGlobalFeatures(graph, community_stack); // does not contain locality time = std::chrono::duration(std::chrono::high_resolution_clock::now() - start).count(); std::cout << "Starting node feature computation [" << time << "s]" << std::endl; auto node_features = computeNodeFeatures(graph, degrees); time = std::chrono::duration(std::chrono::high_resolution_clock::now() - start).count(); std::cout << "Starting Edge feature computation [" << time << "s]" << std::endl; - auto edge_features = computeEdgeFeatures(graph, degrees); + auto edge_features = computeEdgeFeatures(graph, degrees, community_stack); time = std::chrono::duration(std::chrono::high_resolution_clock::now() - start).count(); std::cout << "Feature computation complete [" << time << "s]" << std::endl; diff --git a/tools/neighborhood_computation.h b/tools/neighborhood_computation.h index 64d3d486d..50a6fc4c1 100644 --- a/tools/neighborhood_computation.h +++ b/tools/neighborhood_computation.h @@ -43,7 +43,7 @@ using namespace mt_kahypar; using FastResetArray = kahypar::ds::FastResetFlagArray<>; -using Graph = ds::StaticGraph; +using StaticGraph = ds::StaticGraph; struct NeighborhoodResult { std::array roots; @@ -83,12 +83,12 @@ class NeighborhoodComputation { } template - NeighborhoodResult computeNeighborhood(const Graph& graph, std::array roots, bool include_two_hop) { + NeighborhoodResult computeNeighborhood(const StaticGraph& graph, std::array roots, bool include_two_hop) { return computeNeighborhood(graph, roots, include_two_hop, [](HypernodeID){ return true; }); } template - NeighborhoodResult computeNeighborhood(const Graph& graph, std::array roots, bool include_two_hop, F filter) { + NeighborhoodResult computeNeighborhood(const StaticGraph& graph, std::array roots, bool include_two_hop, F filter) { static_assert(N > 0 && N <= 2); ALWAYS_ASSERT(n1_list.empty()); NeighborhoodResult result {{roots[0], roots[0]}, n1_list, n1_set, n2_list, n2_set, include_two_hop}; @@ -143,7 +143,7 @@ class CliqueComputation { child = nullptr; } - uint64_t computeMaxCliqueSize(const Graph& graph, const std::vector& nodes) { + uint64_t computeMaxCliqueSize(const StaticGraph& graph, const std::vector& nodes) { current_set.reset(); forbidden.reset(); list.resize(nodes.size());