diff --git a/.clangd b/.clangd index c7a47fe9..359a391e 100644 --- a/.clangd +++ b/.clangd @@ -1,2 +1,2 @@ CompileFlags: - Add: [-std=c++17] + Add: [-std=c++20] diff --git a/.gitignore b/.gitignore index b3bea2b5..e54d8ae0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ compile_commands.json +layout.kdl .idea +.vscode *~ cmake-build-* build*/ diff --git a/CMakeLists.txt b/CMakeLists.txt index dae02015..c0136f2c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ project(KaMinPar set(PROJECT_VENDOR "Daniel Seemaier") set(PROJECT_CONTACT "daniel.seemaier@kit.edu") -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) ################################################################################ ## Options ## @@ -22,12 +22,15 @@ option(KAMINPAR_BUILD_TESTS "Build unit tests" ON) option(KAMINPAR_BUILD_DISTRIBUTED "Build distributed partitioner." OFF) option(KAMINPAR_BUILD_APPS "Build binaries." ON) option(KAMINPAR_BUILD_BENCHMARKS "Build benchmark binaries." OFF) +option(KAMINPAR_BUILD_TOOLS "Build tool binaries." OFF) option(KAMINPAR_BUILD_EXPERIMENTAL_FEATURES "Include experimental features in the build. This might increase compile times drastically." OFF) # Control how to build ###################### -option(KAMINPAR_ENABLE_STATISTICS "Generate and output detailed statistics." ON) +option(KAMINPAR_ENABLE_HEAP_PROFILING "Profile and output heap memory usage." OFF) +option(KAMINPAR_ENABLE_PAGE_PROFILING "Profile pages allocated via mmap." OFF) +option(KAMINPAR_ENABLE_STATISTICS "Generate and output detailed statistics." OFF) option(KAMINPAR_ENABLE_TIMERS "Measure running times. Must be set to 'OFF' if the library interface is used from multiple threads simulatinously." ON) option(KAMINPAR_ENABLE_TIMER_BARRIERS "Add additional MPI_Barrier() instructions for more accurate time measurements." ON) @@ -37,8 +40,22 @@ option(KAMINPAR_BUILD_WITH_MTUNE_NATIVE "Build with -mtune=native." ON) option(KAMINPAR_BUILD_WITH_CCACHE "Use ccache to build." ON) option(KAMINPAR_BUILD_WITH_DEBUG_SYMBOLS "Always build with debug symbols, even in Release mode." ON) option(KAMINPAR_BUILD_WITH_MTKAHYPAR "If Mt-KaHyPar can be found, build the Mt-KaHyPar initial partitioner." OFF) +option(KAMINPAR_BUILD_WITH_GROWT "Build the shared-memory partitioner with Growt." ON) option(KAMINPAR_BUILD_WITH_PG "Build with the -pg option for profiling." OFF) +# Control graph compression options +################################### +option(KAMINPAR_COMPRESSION_HIGH_DEGREE_ENCODING "Use high-degree encoding for the compressed graph." ON) +option(KAMINPAR_COMPRESSION_INTERVAL_ENCODING "Use interval encoding for the compressed graph." ON) +option(KAMINPAR_COMPRESSION_RUN_LENGTH_ENCODING "Use run-length encoding for the compressed graph." OFF) +option(KAMINPAR_COMPRESSION_STREAM_ENCODING "Use stream encoding for the compressed graph." OFF) +option(KAMINPAR_COMPRESSION_FAST_DECODING "Use fast decoding for the compressed graph." 
OFF) +option(KAMINPAR_COMPRESSION_ISOLATED_NODES_SEPARATION "Whether all isolated nodes are the last nodes of the input graph" OFF) + +if (KAMINPAR_COMPRESSION_RUN_LENGTH_ENCODING AND KAMINPAR_COMPRESSION_STREAM_ENCODING) + message(FATAL_ERROR "Either run-length or stream encoding can be used for varints but not both.") +endif () + # Control data type sizes ######################### @@ -109,7 +126,26 @@ if (KAMINPAR_BUILD_WITH_DEBUG_SYMBOLS) add_compile_options(-g -g3) endif () -# Set compile flags +# Set compile flags +add_compile_options(-msse4.1) + +check_cxx_compiler_flag(-mcx16 COMPILER_SUPPORTS_MCX16) +if (COMPILER_SUPPORTS_MCX16) + add_compile_options(-mcx16) +else () + message(WARNING "-mcx16 flag not supported by the compiler") + + if (KAMINPAR_BUILD_WITH_GROWT) + message(WARNING "-mcx16 flag not supported by the compiler: cannot use growt for the shared-memory partitioner") + set(KAMINPAR_BUILD_WITH_GROWT OFF) + endif () + + if (KAMINPAR_BUILD_DISTRIBUTED) + message(WARNING "-mcx16 flag not supported by the compiler: cannot build the distributed partitioner") + set(KAMINPAR_BUILD_DISTRIBUTED OFF) + endif () +endif () + if (KAMINPAR_BUILD_WITH_MTUNE_NATIVE) add_compile_options(-mtune=native -march=native) endif () @@ -133,7 +169,24 @@ if (KAMINPAR_ENABLE_STATISTICS) list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_ENABLE_STATISTICS") message(STATUS "Statistics: enabled") else () - message(STATIS "Statistics: disabled") + message(STATUS "Statistics: disabled") +endif () + +if (KAMINPAR_ENABLE_HEAP_PROFILING) + string(LENGTH "${CMAKE_SOURCE_DIR}/" SOURCE_PATH_SIZE) + + list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_ENABLE_HEAP_PROFILING") + list(APPEND KAMINPAR_DEFINITIONS "-DSOURCE_PATH_SIZE=${SOURCE_PATH_SIZE}") + message(STATUS "Heap Profiling: enabled") +else () + message(STATUS "Heap Profiling: disabled") +endif () + +if (KAMINPAR_ENABLE_PAGE_PROFILING) + list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_ENABLE_PAGE_PROFILING") + message(STATUS "Page Profiling: enabled") +else () + message(STATUS "Page Profiling: disabled") endif () if (KAMINPAR_ENABLE_TIMERS) @@ -150,6 +203,51 @@ else () message(STATUS "Timer barriers: disabled") endif () +message(STATUS "Graph compression summary:") + +if (KAMINPAR_COMPRESSION_HIGH_DEGREE_ENCODING) + list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_COMPRESSION_HIGH_DEGREE_ENCODING") + message(" High-degree encoding: enabled") +else () + message(" High-degree encoding: disabled") +endif () + +if (KAMINPAR_COMPRESSION_INTERVAL_ENCODING) + list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_COMPRESSION_INTERVAL_ENCODING") + message(" Interval encoding: enabled") +else () + message(" Interval encoding: disabled") +endif () + +if (KAMINPAR_COMPRESSION_RUN_LENGTH_ENCODING) + list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_COMPRESSION_RUN_LENGTH_ENCODING") + message(" Run-length encoding: enabled") +else () + message(" Run-length encoding: disabled") +endif () + +if (KAMINPAR_COMPRESSION_STREAM_ENCODING) + list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_COMPRESSION_STREAM_ENCODING") + message(" Stream encoding: enabled") +else () + message(" Stream encoding: disabled") +endif () + +if (KAMINPAR_COMPRESSION_FAST_DECODING) + list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_COMPRESSION_FAST_DECODING") + add_compile_options(-mbmi2) + message(" Fast decoding: enabled") +else () + message(" Fast decoding: disabled") +endif () + +if (KAMINPAR_COMPRESSION_ISOLATED_NODES_SEPARATION) + list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_COMPRESSION_ISOLATED_NODES_SEPARATION") + message(" Isolated 
nodes separation: enabled") +else () + message(" Isolated nodes separation: disabled") +endif () + if (KAMINPAR_64BIT_NODE_IDS OR KAMINPAR_64BIT_IDS) list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_64BIT_NODE_IDS") set(KAMINPAR_SHM_NODE_ID_STR "std::uint64_t") @@ -203,6 +301,14 @@ if (KAMINPAR_BUILD_WITH_CCACHE) endif () endif () +if (KAMINPAR_BUILD_WITH_GROWT) + list(APPEND KAMINPAR_DEFINITIONS "-DKAMINPAR_USES_GROWT") + + add_subdirectory(external/growt EXCLUDE_FROM_ALL) + add_library(growt INTERFACE) + target_include_directories(growt SYSTEM INTERFACE "external/growt") +endif () + if (KAMINPAR_BUILD_DISTRIBUTED) # MPI set(MPI_DETERMINE_LIBRARY_VERSION TRUE) @@ -212,19 +318,6 @@ if (KAMINPAR_BUILD_DISTRIBUTED) set(KAMINPAR_BUILD_DISTRIBUTED OFF) endif () - # Growt (needs -mcx16, i.e., does not work on ARM) - check_cxx_compiler_flag(-mcx16 COMPILER_SUPPORTS_MCX16) - if (COMPILER_SUPPORTS_MCX16) - add_compile_options(-mcx16) - else () - message(WARNING "-mcx16 flag not supported by the compiler: cannot build the distributed partitioner") - set(KAMINPAR_BUILD_DISTRIBUTED OFF) - endif() - - add_subdirectory(external/growt EXCLUDE_FROM_ALL) - add_library(growt INTERFACE) - target_include_directories(growt SYSTEM INTERFACE "external/growt") - # Google Sparsehash find_package(Sparsehash REQUIRED) endif () @@ -290,7 +383,7 @@ endif () # Unit tests if (KAMINPAR_BUILD_TESTS) - add_subdirectory(external/googletest EXCLUDE_FROM_ALL) + add_subdirectory(external/googletest EXCLUDE_FROM_ALL SYSTEM) enable_testing() add_subdirectory(tests) diff --git a/CMakePresets.json b/CMakePresets.json new file mode 100644 index 00000000..c1ccc7ce --- /dev/null +++ b/CMakePresets.json @@ -0,0 +1,60 @@ +{ + "version": 6, + "cmakeMinimumRequired": { + "major": 3, + "minor": 21, + "patch": 0 + }, + "configurePresets": [ + { + "name": "default", + "displayName": "Default Config", + "cacheVariables": { + "KAMINPAR_64BIT_IDS": "OFF", + "KAMINPAR_64BIT_EDGE_IDS": "OFF", + "KAMINPAR_64BIT_NODE_IDS": "OFF", + "KAMINPAR_64BIT_WEIGHTS": "OFF" + } + }, + { + "name": "distributed", + "displayName": "Default Config for dKaMinPar", + "cacheVariables": { + "KAMINPAR_BUILD_DISTRIBUTED": "ON", + "KAMINPAR_64BIT_IDS": "OFF", + "KAMINPAR_64BIT_EDGE_IDS": "OFF", + "KAMINPAR_64BIT_NODE_IDS": "OFF", + "KAMINPAR_64BIT_WEIGHTS": "ON" + } + }, + { + "name": "compressed", + "displayName": "Default Config for KaMinPar with Memory Optimizations", + "cacheVariables": { + "KAMINPAR_64BIT_EDGE_IDS": "ON", + "KAMINPAR_64BIT_WEIGHTS": "ON" + } + }, + { + "name": "stats", + "displayName": "Default Config for KaMinPar with Statistics", + "cacheVariables": { + "KAMINPAR_ENABLE_STATISTICS": "ON", + "KAMINPAR_ENABLE_HEAP_PROFILING": "ON" + } + }, + + { + "name": "default-stats", + "inherits": ["default", "stats"] + }, + { + "name": "compressed-stats", + "inherits": ["compressed", "stats"] + }, + { + "name": "distributed-stats", + "inherits": ["distributed", "stats"] + } + ] +} diff --git a/README.MD b/README.MD index c47f890e..32c39592 100644 --- a/README.MD +++ b/README.MD @@ -21,7 +21,7 @@ Moreover, for large values of k, it is an order of magnitude faster than competi Build KaMinPar following the standard CMake steps: ```shell -cmake -B build -DCMAKE_BUILD_TYPE=Release -DKAMINPAR_BUILD_DISTRIBUTED=On +cmake -B build -DCMAKE_BUILD_TYPE=Release --preset= cmake --build build --parallel ``` @@ -43,7 +43,7 @@ Presets can be viewed by using the `--dump-config` flag; to use a custom preset, ```shell # Write the default preset to a file -./KaMinPar [-P 
default|strong|largek] --dump-config > my_preset.ini +./KaMinPar -P default --dump-config > my_preset.ini # ... modify the configuration by editing my_preset.ini ... diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt index df88f15d..c219b590 100644 --- a/apps/CMakeLists.txt +++ b/apps/CMakeLists.txt @@ -1,7 +1,23 @@ +set(KAMINPAR_IO_SOURCE_FILES + io/parhip_parser.h + io/parhip_parser.cc + io/shm_compressed_graph_binary.h + io/shm_compressed_graph_binary.cc + io/shm_input_validator.h + io/shm_input_validator.cc + io/shm_io.h + io/shm_io.cc) + +add_library(kaminpar_io ${KAMINPAR_IO_SOURCE_FILES}) +target_include_directories(kaminpar_io PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/io/../") +target_link_libraries(kaminpar_io PUBLIC KaMinPar::KaMinPar KaMinPar::KaMinParCLI11) + +add_library(KaMinPar::KaMinParIO ALIAS kaminpar_io) + function(add_shm_app target) add_executable(${target} ${ARGN}) target_include_directories(${target} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) - target_link_libraries(${target} PRIVATE KaMinPar::KaMinPar KaMinPar::KaMinParCLI11) + target_link_libraries(${target} PRIVATE KaMinPar::KaMinPar KaMinPar::KaMinParCLI11 KaMinPar::KaMinParIO) install(TARGETS ${target}) message(STATUS "Enabled app: ${target}") endfunction() @@ -16,9 +32,6 @@ function(add_dist_app target) endfunction() add_shm_app(KaMinPar KaMinPar.cc) -target_sources(KaMinPar PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/io/shm_io.cc - ${CMAKE_CURRENT_SOURCE_DIR}/io/shm_input_validator.cc) if (TARGET kaminpar_dist) add_dist_app(dKaMinPar dKaMinPar.cc) @@ -31,3 +44,6 @@ if (KAMINPAR_BUILD_BENCHMARKS) add_subdirectory(benchmarks) endif () +if (KAMINPAR_BUILD_TOOLS) + add_subdirectory(tools) +endif () diff --git a/apps/KaMinPar.cc b/apps/KaMinPar.cc index 2d011c3e..94115f04 100644 --- a/apps/KaMinPar.cc +++ b/apps/KaMinPar.cc @@ -13,16 +13,17 @@ #include #include -#include #if __has_include() #include #endif // __has_include() +#include "kaminpar-shm/datastructures/graph.h" + #include "kaminpar-common/environment.h" +#include "kaminpar-common/heap_profiler.h" #include "kaminpar-common/strutils.h" -#include "apps/io/shm_input_validator.h" #include "apps/io/shm_io.h" using namespace kaminpar; @@ -38,14 +39,21 @@ struct ApplicationContext { int max_timer_depth = 3; + bool heap_profiler_detailed = false; + int heap_profiler_max_depth = 3; + bool heap_profiler_print_structs = true; + float heap_profiler_min_struct_size = 10; + BlockID k = 0; bool quiet = false; bool experiment = false; bool validate = false; + bool debug = false; std::string graph_filename = ""; std::string partition_filename = ""; + io::GraphFileFormat graph_file_format = io::GraphFileFormat::METIS; }; void setup_context(CLI::App &cli, ApplicationContext &app, Context &ctx) { @@ -87,12 +95,59 @@ The output should be stored in a file and can be used by the -C,--config option. ->check(CLI::NonNegativeNumber) ->default_val(app.num_threads); cli.add_flag("-E,--experiment", app.experiment, "Use an output format that is easier to parse."); + cli.add_flag( + "-D,--debug", + app.debug, + "Same as -E, but print additional debug information (that might impose a running time " + "penalty)." + ); cli.add_option( "--max-timer-depth", app.max_timer_depth, "Set maximum timer depth shown in result summary." 
); cli.add_flag_function("-T,--all-timers", [&](auto) { app.max_timer_depth = std::numeric_limits::max(); }); + cli.add_option("-f,--graph-file-format", app.graph_file_format) + ->transform(CLI::CheckedTransformer(io::get_graph_file_formats()).description("")) + ->description(R"(Graph file formats: + - metis + - parhip)") + ->capture_default_str(); + + if constexpr (kHeapProfiling) { + auto *hp_group = cli.add_option_group("Heap Profiler"); + + hp_group + ->add_flag( + "-H,--hp-print-detailed", + app.heap_profiler_detailed, + "Show all levels and data structures in the result summary." + ) + ->default_val(app.heap_profiler_detailed); + hp_group + ->add_option( + "--hp-max-depth", + app.heap_profiler_max_depth, + "Set maximum heap profiler depth shown in the result summary." + ) + ->default_val(app.heap_profiler_max_depth); + hp_group + ->add_option( + "--hp-print-structs", + app.heap_profiler_print_structs, + "Print data structure memory statistics in the result summary." + ) + ->default_val(app.heap_profiler_print_structs); + hp_group + ->add_option( + "--hp-min-struct-size", + app.heap_profiler_min_struct_size, + "Sets the minimum size of a data structure in MB to be included in the result summary." + ) + ->default_val(app.heap_profiler_min_struct_size) + ->check(CLI::NonNegativeNumber); + } + cli.add_option("-o,--output", app.partition_filename, "Output filename for the graph partition.") ->capture_default_str(); cli.add_flag( @@ -132,26 +187,27 @@ int main(int argc, char *argv[]) { std::exit(0); } - // Allocate graph data structures and read graph file - StaticArray xadj; - StaticArray adjncy; - StaticArray vwgt; - StaticArray adjwgt; - - if (app.validate) { - shm::io::metis::read(app.graph_filename, xadj, adjncy, vwgt, adjwgt); - shm::validate_undirected_graph(xadj, adjncy, vwgt, adjwgt); - } else { - shm::io::metis::read(app.graph_filename, xadj, adjncy, vwgt, adjwgt); + if (ctx.compression.enabled && ctx.node_ordering == NodeOrdering::DEGREE_BUCKETS) { + std::cout << "The nodes of the compressed graph cannot be rearranged by degree buckets!" + << std::endl; + std::exit(0); } - const NodeID n = static_cast(xadj.size() - 1); - std::vector partition(n); - - EdgeID *xadj_ptr = xadj.data(); - NodeID *adjncy_ptr = adjncy.data(); - NodeWeight *vwgt_ptr = !vwgt.empty() ? vwgt.data() : nullptr; - EdgeWeight *adjwgt_ptr = !adjwgt.empty() ? 
adjwgt.data() : nullptr; + ENABLE_HEAP_PROFILER(); + + // Read the input graph and allocate memory for the partition + START_HEAP_PROFILER("Input Graph Allocation"); + Graph graph = io::read( + app.graph_filename, + app.graph_file_format, + ctx.compression.enabled, + ctx.compression.may_dismiss, + ctx.node_ordering == NodeOrdering::IMPLICIT_DEGREE_BUCKETS, + app.validate + ); + RECORD("partition") std::vector partition(graph.n()); + RECORD_LOCAL_DATA_STRUCT("vector", partition.capacity() * sizeof(BlockID)); + STOP_HEAP_PROFILER(); // Compute graph partition KaMinPar partitioner(app.num_threads, ctx); @@ -159,14 +215,26 @@ int main(int argc, char *argv[]) { if (app.quiet) { partitioner.set_output_level(OutputLevel::QUIET); + } else if (app.debug) { + partitioner.set_output_level(OutputLevel::DEBUG); } else if (app.experiment) { partitioner.set_output_level(OutputLevel::EXPERIMENT); } partitioner.context().debug.graph_name = str::extract_basename(app.graph_filename); partitioner.set_max_timer_depth(app.max_timer_depth); - partitioner.take_graph(n, xadj_ptr, adjncy_ptr, vwgt_ptr, adjwgt_ptr); + if constexpr (kHeapProfiling) { + auto &global_heap_profiler = heap_profiler::HeapProfiler::global(); + if (app.heap_profiler_detailed) { + global_heap_profiler.set_detailed_summary_options(); + } else { + global_heap_profiler.set_max_depth(app.heap_profiler_max_depth); + global_heap_profiler.set_print_data_structs(app.heap_profiler_print_structs); + global_heap_profiler.set_min_data_struct_size(app.heap_profiler_min_struct_size); + } + } + partitioner.set_graph(std::move(graph)); partitioner.compute_partition(app.k, partition.data()); // Save graph partition @@ -174,5 +242,7 @@ int main(int argc, char *argv[]) { shm::io::partition::write(app.partition_filename, partition); } + DISABLE_HEAP_PROFILER(); + return 0; } diff --git a/apps/benchmarks/CMakeLists.txt b/apps/benchmarks/CMakeLists.txt index 3115d6b7..29cda36a 100644 --- a/apps/benchmarks/CMakeLists.txt +++ b/apps/benchmarks/CMakeLists.txt @@ -3,12 +3,16 @@ function(add_shm_benchmark target) target_link_libraries(${target} PRIVATE KaMinPar::KaMinPar KaMinPar::KaMinParCLI11 + KaMinPar::KaMinParIO KaGen::KaGen) message(STATUS "Enabled benchmark: ${target}") endfunction() # Shared-memory benchmarks +add_shm_benchmark(shm_compressed_graph_benchmark shm_compressed_graph_benchmark.cc) +add_shm_benchmark(shm_label_propagation_benchmark shm_label_propagation_benchmark.cc) add_shm_benchmark(shm_refinement_benchmark shm_refinement_benchmark.cc) +add_shm_benchmark(shm_variable_length_codec_benchmark shm_variable_length_codec_benchmark.cc) add_shm_benchmark(shm_gain_cache_benchmark shm_gain_cache_benchmark.cc) if (KAMINPAR_BUILD_DISTRIBUTED) diff --git a/apps/benchmarks/shm_compressed_graph_benchmark.cc b/apps/benchmarks/shm_compressed_graph_benchmark.cc new file mode 100644 index 00000000..74490f9c --- /dev/null +++ b/apps/benchmarks/shm_compressed_graph_benchmark.cc @@ -0,0 +1,550 @@ +/******************************************************************************* + * Graph compression benchmark for the shared-memory algorithm. 
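+ * Reads the input graph both as a plain CSR graph and as a compressed graph,
+ * optionally checks that both representations answer the basic graph queries
+ * identically, and times the access operations (degree, incident edges,
+ * adjacent nodes, neighbors) on each representation.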
+ * + * @file: shm_compressed_graph_benchmark.cc + * @author: Daniel Salwasser + * @date: 12.11.2023 + ******************************************************************************/ +#include "kaminpar-cli/CLI11.h" + +#include "kaminpar-shm/datastructures/graph.h" +#include "kaminpar-shm/graphutils/permutator.h" + +#include "kaminpar-common/console_io.h" +#include "kaminpar-common/heap_profiler.h" +#include "kaminpar-common/logger.h" +#include "kaminpar-common/timer.h" + +#include "apps/io/shm_io.h" + +using namespace kaminpar; +using namespace kaminpar::shm; + +static std::string to_megabytes(std::size_t bytes) { + std::stringstream stream; + stream << std::fixed << std::setprecision(2) << (bytes / (float)(1024 * 1024)); + return stream.str(); +} + +template static bool operator!=(const IotaRange &a, const IotaRange &b) { + if (a.begin() == a.end()) { + return b.begin() != b.end(); + } + + return a.begin() != b.begin() || a.end() != b.end(); +}; + +// See https://github.com/google/benchmark/blob/main/include/benchmark/benchmark.h +template static inline void do_not_optimize(T value) { + asm volatile("" : "+m"(value) : : "memory"); +} + +template static void benchmark_degree(const Graph &graph) { + SCOPED_HEAP_PROFILER("Degree"); + SCOPED_TIMER("Degree"); + + for (const auto node : graph.nodes()) { + do_not_optimize(graph.degree(node)); + } +} + +template static void benchmark_incident_edges(const Graph &graph) { + SCOPED_HEAP_PROFILER("Incident Edges"); + SCOPED_TIMER("Incident Edges"); + + for (const auto node : graph.nodes()) { + for (const auto incident_edge : graph.incident_edges(node)) { + do_not_optimize(incident_edge); + } + } +} + +template static void benchmark_adjacent_nodes(const Graph &graph) { + SCOPED_HEAP_PROFILER("Adjacent Nodes"); + SCOPED_TIMER("Adjacent Nodes"); + + for (const auto node : graph.nodes()) { + graph.adjacent_nodes(node, [&](const auto adjacent_node) { do_not_optimize(adjacent_node); }); + } +} + +template static void benchmark_neighbors(const Graph &graph) { + SCOPED_HEAP_PROFILER("Neighbors"); + SCOPED_TIMER("Neighbors"); + + for (const auto node : graph.nodes()) { + graph.neighbors(node, [](const auto incident_edge, const auto adjacent_node) { + do_not_optimize(incident_edge); + do_not_optimize(adjacent_node); + }); + } +} + +template static void benchmark_pfor_neighbors(const Graph &graph) { + SCOPED_HEAP_PROFILER("Parallel For Neighbors"); + SCOPED_TIMER("Parallel For Neighbors"); + + for (const auto node : graph.nodes()) { + graph.pfor_neighbors( + node, + std::numeric_limits::max(), + 1000, + [](const auto incident_edge, const auto adjacent_node) { + do_not_optimize(incident_edge); + do_not_optimize(adjacent_node); + } + ); + } +} + +static void expect_equal_size(const CSRGraph &graph, const CompressedGraph &compressed_graph) { + if (graph.n() != compressed_graph.n()) { + LOG_ERROR << "The uncompressed graph has " << graph.n() + << " nodes and the compressed graph has " << compressed_graph.n() << " nodes!"; + return; + } + + if (graph.m() != compressed_graph.m()) { + LOG_ERROR << "The uncompressed graph has " << graph.m() + << " edges and the compressed graph has " << compressed_graph.m() << " edges!"; + return; + } +} + +static void expect_equal_nodes(const CSRGraph &graph, const CompressedGraph &compressed_graph) { + if (graph.nodes() != compressed_graph.nodes()) { + LOG_ERROR << "The nodes of the compressed and uncompressed graph do not match!"; + return; + } +} + +static void expect_equal_edges(const CSRGraph &graph, const CompressedGraph 
&compressed_graph) { + if (graph.edges() != compressed_graph.edges()) { + LOG_ERROR << "The edges of the compressed and uncompressed graph do not match!"; + return; + } +} + +static void expect_equal_degree(const CSRGraph &graph, const CompressedGraph &compressed_graph) { + for (const auto node : graph.nodes()) { + if (graph.degree(node) != compressed_graph.degree(node)) { + LOG_ERROR << "The node " << node << " has degree " << compressed_graph.degree(node) + << " in the compressed graph and degree " << graph.degree(node) + << " in the uncompressed graph!"; + return; + } + } +} + +static void +expect_equal_incident_edges(const CSRGraph &graph, const CompressedGraph &compressed_graph) { + for (const auto node : graph.nodes()) { + if (graph.incident_edges(node) != compressed_graph.incident_edges(node)) { + LOG_ERROR << "The incident edges of node " << node + << " in the compressed and uncompressed graph do not match!"; + return; + } + } +} + +static void +expect_equal_adjacent_nodes(const CSRGraph &graph, const CompressedGraph &compressed_graph) { + std::vector graph_neighbours; + std::vector compressed_graph_neighbours; + + for (const NodeID node : graph.nodes()) { + graph.adjacent_nodes(node, [&](const NodeID adjacent_node) { + graph_neighbours.push_back(adjacent_node); + }); + + compressed_graph.adjacent_nodes(node, [&](const NodeID adjacent_node) { + compressed_graph_neighbours.push_back(adjacent_node); + }); + + if (graph_neighbours.size() != compressed_graph_neighbours.size()) { + LOG_ERROR << "Node " << node << " has " << graph_neighbours.size() + << " neighbours in the uncompressed graph but " + << compressed_graph_neighbours.size() << " neighbours in the compressed graph!"; + return; + } + + std::sort(graph_neighbours.begin(), graph_neighbours.end()); + std::sort(compressed_graph_neighbours.begin(), compressed_graph_neighbours.end()); + if (graph_neighbours != compressed_graph_neighbours) { + LOG_ERROR << "The neighbourhood of node " << node + << " in the compressed and uncompressed graph does not match!"; + return; + } + + graph_neighbours.clear(); + compressed_graph_neighbours.clear(); + } +} + +static void +expect_equal_neighbours(const CSRGraph &graph, const CompressedGraph &compressed_graph) { + std::vector graph_incident_edges; + std::vector graph_adjacent_node; + std::vector compressed_graph_incident_edges; + std::vector compressed_graph_adjacent_node; + + for (const NodeID node : graph.nodes()) { + graph.neighbors(node, [&](const auto incident_edge, const auto adjacent_node) { + graph_incident_edges.push_back(incident_edge); + graph_adjacent_node.push_back(adjacent_node); + }); + + compressed_graph.neighbors(node, [&](const auto incident_edge, const auto adjacent_node) { + compressed_graph_incident_edges.push_back(incident_edge); + compressed_graph_adjacent_node.push_back(adjacent_node); + }); + + if (graph_incident_edges.size() != compressed_graph_incident_edges.size()) { + LOG_ERROR << "Node " << node << " has " << graph_incident_edges.size() + << " neighbours in the uncompressed graph but " + << compressed_graph_incident_edges.size() << " neighbours in the compressed graph!"; + return; + } + + std::sort(graph_incident_edges.begin(), graph_incident_edges.end()); + std::sort(graph_adjacent_node.begin(), graph_adjacent_node.end()); + std::sort(compressed_graph_incident_edges.begin(), compressed_graph_incident_edges.end()); + std::sort(compressed_graph_adjacent_node.begin(), compressed_graph_adjacent_node.end()); + + if (graph_incident_edges != 
compressed_graph_incident_edges) { + LOG_ERROR << "The incident edges of node " << node + << " in the compressed and uncompressed graph do not match!"; + return; + } + + if (graph_adjacent_node != compressed_graph_adjacent_node) { + LOG_ERROR << "The adjacent nodes of node " << node + << " in the compressed and uncompressed graph do not match!"; + return; + } + + graph_incident_edges.clear(); + graph_adjacent_node.clear(); + compressed_graph_incident_edges.clear(); + compressed_graph_adjacent_node.clear(); + } +} + +static void expect_equal_neighbours_max(CSRGraph &graph, const CompressedGraph &compressed_graph) { + std::vector graph_incident_edges; + std::vector graph_adjacent_node; + std::vector compressed_graph_incident_edges; + std::vector compressed_graph_adjacent_node; + + graph::reorder_edges_by_compression(graph); + + for (const NodeID node : graph.nodes()) { + const NodeID max_neighbor_count = graph.degree(node) / 2; + + graph.neighbors( + node, + max_neighbor_count, + [&](const auto incident_edge, const auto adjacent_node) { + graph_incident_edges.push_back(incident_edge); + graph_adjacent_node.push_back(adjacent_node); + } + ); + + compressed_graph.neighbors( + node, + max_neighbor_count, + [&](const auto incident_edge, const auto adjacent_node) { + compressed_graph_incident_edges.push_back(incident_edge); + compressed_graph_adjacent_node.push_back(adjacent_node); + } + ); + + if (graph_incident_edges.size() != compressed_graph_incident_edges.size()) { + LOG_ERROR << "Node " << node << " has " << graph_incident_edges.size() + << " neighbours in the uncompressed graph but " + << compressed_graph_incident_edges.size() << " neighbours in the compressed graph!"; + return; + } + + std::sort(graph_incident_edges.begin(), graph_incident_edges.end()); + std::sort(graph_adjacent_node.begin(), graph_adjacent_node.end()); + std::sort(compressed_graph_incident_edges.begin(), compressed_graph_incident_edges.end()); + std::sort(compressed_graph_adjacent_node.begin(), compressed_graph_adjacent_node.end()); + + if (graph_incident_edges != compressed_graph_incident_edges) { + LOG_ERROR << "The incident edges of node " << node + << " in the compressed and uncompressed graph do not match!"; + return; + } + + if (graph_adjacent_node != compressed_graph_adjacent_node) { + LOG_ERROR << "The adjacent nodes of node " << node + << " in the compressed and uncompressed graph do not match!"; + return; + } + + graph_incident_edges.clear(); + graph_adjacent_node.clear(); + compressed_graph_incident_edges.clear(); + compressed_graph_adjacent_node.clear(); + } +} + +static void +expect_equal_pfor_neighbors(const CSRGraph &graph, const CompressedGraph &compressed_graph) { + tbb::concurrent_vector graph_adjacent_node; + tbb::concurrent_vector compressed_graph_adjacent_node; + + for (const NodeID node : graph.nodes()) { + graph.pfor_neighbors( + node, + std::numeric_limits::max(), + std::numeric_limits::max(), + [&](const EdgeID e, const NodeID v) { graph_adjacent_node.push_back(v); } + ); + + compressed_graph.pfor_neighbors( + node, + std::numeric_limits::max(), + std::numeric_limits::max(), + [&](const EdgeID e, const NodeID v) { compressed_graph_adjacent_node.push_back(v); } + ); + + if (graph_adjacent_node.size() != compressed_graph_adjacent_node.size()) { + LOG_ERROR << "Node " << node << " has " << graph_adjacent_node.size() + << " adjacent nodes in the uncompressed graph but " + << compressed_graph_adjacent_node.size() + << " adjacent node in the compressed graph!"; + return; + } + + 
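+    // pfor_neighbors visits the neighborhood in parallel, so the visit order is
+    // nondeterministic; sort both sequences before comparing them element by element.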
std::sort(graph_adjacent_node.begin(), graph_adjacent_node.end()); + std::sort(compressed_graph_adjacent_node.begin(), compressed_graph_adjacent_node.end()); + + if (graph_adjacent_node != compressed_graph_adjacent_node) { + LOG_ERROR << "The adjacent nodes of node " << node + << " in the compressed and uncompressed graph do not match!"; + return; + } + + graph_adjacent_node.clear(); + compressed_graph_adjacent_node.clear(); + } +} + +static void expect_equal_compressed_graph_edge_weights( + const CSRGraph &graph, const CompressedGraph &compressed_graph +) { + std::vector> csr_graph_edge_weights; + std::vector> compressed_graph_edge_weights; + + for (const NodeID node : graph.nodes()) { + graph.neighbors(node, [&](const EdgeID incident_edge, const NodeID adjacent_node) { + csr_graph_edge_weights.emplace_back(adjacent_node, graph.edge_weight(incident_edge)); + }); + + compressed_graph.neighbors(node, [&](const EdgeID incident_edge, const NodeID adjacent_node) { + compressed_graph_edge_weights.emplace_back( + adjacent_node, compressed_graph.edge_weight(incident_edge) + ); + }); + + if (csr_graph_edge_weights.size() != compressed_graph_edge_weights.size()) { + LOG_ERROR << "Node " << node << " has " << csr_graph_edge_weights.size() + << " adjacent nodes in the uncompressed graph but " + << compressed_graph_edge_weights.size() + << " adjacent node in the compressed graph!"; + return; + } + + std::sort( + csr_graph_edge_weights.begin(), + csr_graph_edge_weights.end(), + [](const auto &a, const auto &b) { return a.first < b.first; } + ); + + std::sort( + compressed_graph_edge_weights.begin(), + compressed_graph_edge_weights.end(), + [](const auto &a, const auto &b) { return a.first < b.first; } + ); + + if (csr_graph_edge_weights != compressed_graph_edge_weights) { + LOG_ERROR << "The edge weights of node " << node + << " in the compressed and uncompressed graph do not match!"; + return; + } + + csr_graph_edge_weights.clear(); + compressed_graph_edge_weights.clear(); + } +} + +static void expect_equal_rearrange_compressed_edge_weights( + CSRGraph &graph, const CompressedGraph &compressed_graph +) { + graph::reorder_edges_by_compression(graph); + + for (const NodeID node : graph.nodes()) { + for (const auto [incident_edge, adjacent_node] : graph.neighbors(node)) { + if (graph.edge_weight(incident_edge) != compressed_graph.edge_weight(incident_edge)) { + LOG_ERROR << "Edge " << incident_edge << " has weight " << graph.edge_weight(incident_edge) + << " in the rearranged uncompressed graph but weight " + << compressed_graph.edge_weight(incident_edge) << " in the compressed graph!"; + return; + } + } + } +} + +static void run_checks(CSRGraph &graph, const CompressedGraph &compressed_graph) { + LOG << "Checking if the graph operations are valid..."; + + expect_equal_size(graph, compressed_graph); + expect_equal_nodes(graph, compressed_graph); + expect_equal_edges(graph, compressed_graph); + expect_equal_degree(graph, compressed_graph); + expect_equal_incident_edges(graph, compressed_graph); + expect_equal_adjacent_nodes(graph, compressed_graph); + expect_equal_neighbours(graph, compressed_graph); + expect_equal_neighbours_max(graph, compressed_graph); + expect_equal_pfor_neighbors(graph, compressed_graph); + expect_equal_compressed_graph_edge_weights(graph, compressed_graph); + expect_equal_rearrange_compressed_edge_weights(graph, compressed_graph); +} + +static void run_benchmark(CSRGraph graph, CompressedGraph compressed_graph) { + LOG << "Running the benchmark..."; + + 
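+  // Run the same set of micro-benchmarks on the plain CSRGraph and CompressedGraph
+  // first, and afterwards on the Graph wrappers that own them, so the timer summary
+  // lists the cost of each representation side by side.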
START_HEAP_PROFILER("Uncompressed graph operations"); + TIMED_SCOPE("Uncompressed graph operations") { + benchmark_degree(graph); + benchmark_incident_edges(graph); + benchmark_adjacent_nodes(graph); + benchmark_neighbors(graph); + benchmark_pfor_neighbors(graph); + }; + STOP_HEAP_PROFILER(); + + START_HEAP_PROFILER("Compressed graph operations"); + TIMED_SCOPE("Compressed graph operations") { + benchmark_degree(compressed_graph); + benchmark_incident_edges(compressed_graph); + benchmark_adjacent_nodes(compressed_graph); + benchmark_neighbors(compressed_graph); + benchmark_pfor_neighbors(compressed_graph); + }; + STOP_HEAP_PROFILER(); + + Graph graph_csr(std::make_unique(std::move(graph))); + START_HEAP_PROFILER("Uncompressed underlying graph operations"); + TIMED_SCOPE("Uncompressed underlying graph operations") { + benchmark_degree(graph_csr); + benchmark_incident_edges(graph_csr); + benchmark_adjacent_nodes(graph_csr); + benchmark_neighbors(graph_csr); + benchmark_pfor_neighbors(graph_csr); + }; + STOP_HEAP_PROFILER(); + + Graph graph_compressed(std::make_unique(std::move(compressed_graph))); + START_HEAP_PROFILER("Compressed underlying graph operations"); + TIMED_SCOPE("Compressed underlying graph operations") { + benchmark_degree(graph_compressed); + benchmark_incident_edges(graph_compressed); + benchmark_adjacent_nodes(graph_compressed); + benchmark_neighbors(graph_compressed); + benchmark_pfor_neighbors(graph_compressed); + }; + STOP_HEAP_PROFILER(); +} + +int main(int argc, char *argv[]) { + // Parse CLI arguments + std::string graph_filename; + int num_threads = 1; + bool enable_benchmarks = true; + bool enable_checks = false; + + CLI::App app("Shared-memory graph compression benchmark"); + app.add_option("-G,--graph", graph_filename, "Graph file")->required(); + app.add_option("-t,--threads", num_threads, "Number of threads") + ->check(CLI::NonNegativeNumber) + ->default_val(num_threads); + app.add_option("-b,--benchmark", enable_benchmarks, "Enable graph operations benchmark") + ->default_val(enable_benchmarks); + app.add_option("-c,--checks", enable_checks, "Enable compressed graph operations check") + ->default_val(enable_checks); + + CLI11_PARSE(app, argc, argv); + + tbb::global_control gc(tbb::global_control::max_allowed_parallelism, num_threads); + + ENABLE_HEAP_PROFILER(); + GLOBAL_TIMER.reset(); + + // Read input graph + LOG << "Reading the input graph..."; + + START_HEAP_PROFILER("CSR Graph Allocation"); + CSRGraph graph = TIMED_SCOPE("Read csr graph") { + return io::metis::csr_read(graph_filename); + }; + STOP_HEAP_PROFILER(); + + START_HEAP_PROFILER("Compressed Graph Allocation"); + CompressedGraph compressed_graph = TIMED_SCOPE("Read compressed graph") { + return *io::metis::compress_read(graph_filename); + }; + STOP_HEAP_PROFILER(); + + // Capture graph statistics + std::size_t csr_size = graph.raw_nodes().size() * sizeof(Graph::EdgeID) + + graph.raw_edges().size() * sizeof(Graph::NodeID); + std::size_t compressed_size = compressed_graph.used_memory(); + std::size_t high_degree_count = compressed_graph.high_degree_count(); + std::size_t part_count = compressed_graph.part_count(); + std::size_t interval_count = compressed_graph.interval_count(); + + // Run checks and benchmarks + if (enable_checks) { + run_checks(graph, compressed_graph); + } + + if (enable_benchmarks) { + run_benchmark(std::move(graph), std::move(compressed_graph)); + } + + STOP_TIMER(); + DISABLE_HEAP_PROFILER(); + + // Print the result summary + LOG; + cio::print_delimiter("Result Summary"); + + 
LOG << "Input graph has " << graph.n() << " vertices and " << graph.m() + << " edges. Its density is " << ((graph.m()) / (float)(graph.n() * (graph.n() - 1))) << "."; + LOG << "Node weights: " << (graph.node_weighted() ? "yes" : "no") + << ", edge weights: " << (graph.edge_weighted() ? "yes" : "no"); + LOG; + + LOG << "The uncompressed graph uses " << to_megabytes(csr_size) << " mb (" << csr_size + << " bytes)."; + LOG << "The compressed graph uses " << to_megabytes(compressed_size) << " mb (" << compressed_size + << " bytes)."; + float compression_factor = csr_size / (float)compressed_size; + LOG << "Thats a compression ratio of " << compression_factor << '.'; + LOG; + + LOG << high_degree_count << " (" << (high_degree_count / (float)graph.n()) + << "%) vertices have high degree."; + LOG << part_count << " parts result from splitting the neighborhood of high degree nodes."; + LOG << interval_count << " vertices/parts use interval encoding."; + LOG; + + Timer::global().print_human_readable(std::cout); + LOG; + PRINT_HEAP_PROFILE(std::cout); + + return 0; +} diff --git a/apps/benchmarks/shm_io.h b/apps/benchmarks/shm_io.h index 723f2841..8c90dead 100644 --- a/apps/benchmarks/shm_io.h +++ b/apps/benchmarks/shm_io.h @@ -55,13 +55,13 @@ inline GraphWrapper load_graph(const std::string &graph_name, const bool is_sort wrapper.adjncy = kagen_graph.TakeAdjncy(); wrapper.vwgt = kagen_graph.TakeVertexWeights(); wrapper.adjvwgt = kagen_graph.TakeEdgeWeights(); - wrapper.graph = std::make_unique( + wrapper.graph = std::make_unique(std::make_unique( StaticArray(wrapper.xadj.data(), wrapper.xadj.size()), StaticArray(wrapper.adjncy.data(), wrapper.adjncy.size()), StaticArray(wrapper.vwgt.data(), wrapper.vwgt.size()), StaticArray(wrapper.adjvwgt.data(), wrapper.adjvwgt.size()), is_sorted - ); + )); std::cout << "Loaded graph with n=" << wrapper.graph->n() << ", m=" << wrapper.graph->m() << std::endl; diff --git a/apps/benchmarks/shm_label_propagation_benchmark.cc b/apps/benchmarks/shm_label_propagation_benchmark.cc new file mode 100644 index 00000000..ac8d2481 --- /dev/null +++ b/apps/benchmarks/shm_label_propagation_benchmark.cc @@ -0,0 +1,115 @@ +/******************************************************************************* + * Generic label propagation benchmark for the shared-memory algorithm. 
+ * + * @file: shm_label_propagation_benchmark.cc + * @author: Daniel Salwasser + * @date: 13.12.2023 + ******************************************************************************/ +// clang-format off +#include +// clang-format on + +#include + +#include "kaminpar-shm/coarsening/lp_clustering.h" +#include "kaminpar-shm/context_io.h" +#include "kaminpar-shm/graphutils/permutator.h" +#include "kaminpar-shm/partition_utils.h" + +#include "kaminpar-common/console_io.h" +#include "kaminpar-common/logger.h" +#include "kaminpar-common/random.h" +#include "kaminpar-common/timer.h" + +#include "apps/io/shm_io.h" + +using namespace kaminpar; +using namespace kaminpar::shm; + +int main(int argc, char *argv[]) { + // Create context + Context ctx = create_default_context(); + + // Parse CLI arguments + std::string graph_filename; + int seed = 0; + + CLI::App app("Shared-memory LP benchmark"); + app.add_option("-G,--graph", graph_filename, "Graph file")->required(); + app.add_option("-t,--threads", ctx.parallel.num_threads, "Number of threads"); + app.add_option("-s,--seed", seed, "Seed for random number generation.")->default_val(seed); + app.add_option("-k,--k", ctx.partition.k, "Number of blocks in the partition.")->required(); + app.add_option( + "-e,--epsilon", + ctx.partition.epsilon, + "Maximum allowed imbalance, e.g. 0.03 for 3%. Must be strictly positive." + ) + ->check(CLI::NonNegativeNumber) + ->capture_default_str(); + create_lp_coarsening_options(&app, ctx); + create_partitioning_rearrangement_options(&app, ctx); + create_graph_compression_options(&app, ctx); + CLI11_PARSE(app, argc, argv); + + tbb::global_control gc(tbb::global_control::max_allowed_parallelism, ctx.parallel.num_threads); + Random::reseed(seed); + + Graph graph = io::read( + graph_filename, + io::GraphFileFormat::METIS, + ctx.compression.enabled, + ctx.compression.may_dismiss, + ctx.node_ordering == NodeOrdering::IMPLICIT_DEGREE_BUCKETS, + false + ); + ctx.setup(graph); + + const double original_epsilon = ctx.partition.epsilon; + if (ctx.node_ordering == NodeOrdering::DEGREE_BUCKETS) { + CSRGraph &csr_graph = *dynamic_cast(graph.underlying_graph()); + graph = graph::rearrange_by_degree_buckets(csr_graph); + } + + if (graph.sorted()) { + graph::remove_isolated_nodes(graph, ctx.partition); + } + + const NodeWeight max_cluster_weight = + compute_max_cluster_weight(ctx.coarsening, graph, ctx.partition); + + LPClustering lp_clustering(graph.n(), ctx.coarsening); + lp_clustering.set_max_cluster_weight(max_cluster_weight); + lp_clustering.set_desired_cluster_count(0); + + GLOBAL_TIMER.reset(); + + ENABLE_HEAP_PROFILER(); + START_HEAP_PROFILER("Label Propagation"); + TIMED_SCOPE("Label Propagation") { + lp_clustering.compute_clustering(graph, false); + }; + STOP_HEAP_PROFILER(); + DISABLE_HEAP_PROFILER(); + + STOP_TIMER(); + + if (graph.sorted()) { + graph::integrate_isolated_nodes(graph, original_epsilon, ctx); + } + + cio::print_delimiter("Input Summary", '#'); + std::cout << "Execution mode: " << ctx.parallel.num_threads << "\n"; + std::cout << "Seed: " << Random::get_seed() << "\n"; + cio::print_delimiter("Graph Compression", '-'); + print(ctx.compression, std::cout); + cio::print_delimiter("Coarsening", '-'); + print(ctx.coarsening, std::cout); + + cio::print_delimiter("Result Summary"); + Timer::global().print_human_readable(std::cout); + LOG; + heap_profiler::HeapProfiler::global().set_detailed_summary_options(); + PRINT_HEAP_PROFILE(std::cout); + + return 0; +} diff --git 
a/apps/benchmarks/shm_variable_length_codec_benchmark.cc b/apps/benchmarks/shm_variable_length_codec_benchmark.cc new file mode 100644 index 00000000..9e9db459 --- /dev/null +++ b/apps/benchmarks/shm_variable_length_codec_benchmark.cc @@ -0,0 +1,403 @@ +/******************************************************************************* + * Variable length codec benchmark for the shared-memory algorithm. + * + * @file: shm_variable_length_codec_benchmark.cc + * @author: Daniel Salwasser + * @date: 12.11.2023 + ******************************************************************************/ +#include +#include +#include +#include + +#include "kaminpar-cli/CLI11.h" + +#include "kaminpar-common/console_io.h" +#include "kaminpar-common/logger.h" +#include "kaminpar-common/timer.h" +#include "kaminpar-common/varint_codec.h" +#include "kaminpar-common/varint_run_length_codec.h" +#include "kaminpar-common/varint_stream_codec.h" + +using namespace kaminpar; + +enum class IntType { + INT_32, + INT_64 +}; + +std::unordered_map get_int_types() { + return { + {"int32", IntType::INT_32}, + {"int64", IntType::INT_64}, + }; +} + +template static inline void do_not_optimize(T value) { + asm volatile("" : "+m"(value) : : "memory"); +} + +template std::vector generate_random_values(const std::size_t count) { + std::vector random_values; + random_values.resize(count); + + std::random_device dev; + std::mt19937 rng(dev()); + std::uniform_int_distribution dist( + std::numeric_limits::min(), std::numeric_limits::max() + ); + for (std::size_t i = 0; i < count; ++i) { + random_values[i] = dist(rng); + } + + return random_values; +} + +template +std::unique_ptr +encode_values(std::string_view name, const std::size_t count, Lambda &&l) { + auto encoded_values = std::make_unique(count * varint_max_length()); + + TIMED_SCOPE(name) { + std::uint8_t *ptr = encoded_values.get(); + + for (std::size_t i = 0; i < count; ++i) { + const std::size_t bytes_written = varint_encode(l(i), ptr); + ptr += bytes_written; + } + }; + + return encoded_values; +} + +template +std::unique_ptr +encode_signed_values(std::string_view name, const std::size_t count, Lambda &&l) { + auto encoded_values = std::make_unique(count * varint_max_length()); + + TIMED_SCOPE(name) { + std::uint8_t *ptr = encoded_values.get(); + + for (std::size_t i = 0; i < count; ++i) { + const std::size_t bytes_written = signed_varint_encode(l(i), ptr); + ptr += bytes_written; + } + }; + + return encoded_values; +} + +template +std::unique_ptr +rl_encode_values(std::string_view name, const std::size_t count, Lambda &&l) { + auto encoded_values = std::make_unique(count * sizeof(Int) + count); + + TIMED_SCOPE(name) { + VarIntRunLengthEncoder encoder(encoded_values.get()); + + for (std::size_t i = 0; i < count; ++i) { + const std::size_t bytes_written = encoder.add(l(i)); + do_not_optimize(bytes_written); + } + + encoder.flush(); + }; + + return encoded_values; +} + +template +std::unique_ptr +sv_encode_values(std::string_view name, const std::size_t count, Lambda &&l) { + auto encoded_values = std::make_unique(count * sizeof(Int) + count); + + TIMED_SCOPE(name) { + VarIntStreamEncoder encoder(encoded_values.get(), count); + + for (std::size_t i = 0; i < count; ++i) { + const std::size_t bytes_written = encoder.add(l(i)); + do_not_optimize(bytes_written); + } + + encoder.flush(); + }; + + return encoded_values; +} + +template +std::tuple< + std::unique_ptr, + std::unique_ptr, + std::unique_ptr> +encode_values(const std::size_t count, const std::vector &random_values) { + 
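+  // Encode three input distributions: all zeros (best case, one byte per value),
+  // all maximal values (worst case), and uniformly random values (average case).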
SCOPED_TIMER("Encoding"); + + return std::make_tuple( + encode_values("Encoding zero values", count, [](const std::size_t i) { return 0; }), + encode_values( + "Encoding max values", + count, + [](const std::size_t i) { return std::numeric_limits::max(); } + ), + encode_values( + "Encoding random values", count, [&](const std::size_t i) { return random_values[i]; } + ) + ); +} + +template +std::tuple< + std::unique_ptr, + std::unique_ptr, + std::unique_ptr> +encode_signed_values(const std::size_t count, const std::vector &random_values) { + SCOPED_TIMER("Encoding signed values"); + + return std::make_tuple( + encode_signed_values( + "Encoding zero values", count, [](const std::size_t i) { return 0; } + ), + encode_signed_values( + "Encoding max values", + count, + [](const std::size_t i) { return std::numeric_limits::max(); } + ), + encode_signed_values( + "Encoding random values", count, [&](const std::size_t i) { return random_values[i]; } + ) + ); +} + +template +std::tuple< + std::unique_ptr, + std::unique_ptr, + std::unique_ptr> +rl_encode_values(const std::size_t count, const std::vector &random_values) { + SCOPED_TIMER("Encoding run-length"); + + return std::make_tuple( + rl_encode_values("Encoding zero values", count, [](const std::size_t i) { return 0; }), + rl_encode_values( + "Encoding max values", + count, + [](const std::size_t i) { return std::numeric_limits::max(); } + ), + rl_encode_values( + "Encoding random values", count, [&](const std::size_t i) { return random_values[i]; } + ) + ); +} + +template +std::tuple< + std::unique_ptr, + std::unique_ptr, + std::unique_ptr> +sv_encode_values(const std::size_t count, const std::vector &random_values) { + SCOPED_TIMER("Encoding stream"); + + return std::make_tuple( + sv_encode_values("Encoding zero values", count, [](const std::size_t i) { return 0; }), + sv_encode_values( + "Encoding max values", + count, + [](const std::size_t i) { return std::numeric_limits::max(); } + ), + sv_encode_values( + "Encoding random values", count, [&](const std::size_t i) { return random_values[i]; } + ) + ); +} + +template +void benchmark( + std::string_view name, const std::size_t count, const std::uint8_t *values_ptr, Lambda &&l +) { + SCOPED_TIMER(name); + + for (std::size_t i = 0; i < count; ++i) { + const auto [value, bytes_decoded] = l(values_ptr); + values_ptr += bytes_decoded; + + do_not_optimize(value); + } +} + +template +void benchmark_rle(std::string_view name, const std::size_t count, const std::uint8_t *values_ptr) { + SCOPED_TIMER(name); + + VarIntRunLengthDecoder decoder(values_ptr); + decoder.decode(count, [](const Int value) { do_not_optimize(value); }); +} + +template +void benchmark_sve(std::string_view name, const std::size_t count, const std::uint8_t *values_ptr) { + SCOPED_TIMER(name); + + VarIntStreamDecoder decoder(values_ptr, count); + decoder.decode(count, [](const Int value) { do_not_optimize(value); }); +} + +template +void benchmark( + std::string_view name, + const std::size_t count, + const std::uint8_t *zero_values_ptr, + const std::uint8_t *max_values_ptr, + const std::uint8_t *random_values_ptr, + Lambda &&l +) { + SCOPED_TIMER(name); + + benchmark("Decoding zero", count, zero_values_ptr, std::forward(l)); + benchmark("Decoding max values", count, max_values_ptr, std::forward(l)); + benchmark("Decoding random values", count, random_values_ptr, std::forward(l)); +} + +template +void benchmark_rle( + std::string_view name, + const std::size_t count, + const std::uint8_t *zero_values_ptr, + const std::uint8_t 
*max_values_ptr, + const std::uint8_t *random_values_ptr +) { + SCOPED_TIMER(name); + + benchmark_rle("Decoding zero values", count, zero_values_ptr); + benchmark_rle("Decoding max values", count, max_values_ptr); + benchmark_rle("Decoding random values", count, random_values_ptr); +} + +template +void benchmark_sve( + std::string_view name, + const std::size_t count, + const std::uint8_t *zero_values_ptr, + const std::uint8_t *max_values_ptr, + const std::uint8_t *random_values_ptr +) { + SCOPED_TIMER(name); + + benchmark_sve("Decoding zero values", count, zero_values_ptr); + benchmark_sve("Decoding max values", count, max_values_ptr); + benchmark_sve("Decoding random values", count, random_values_ptr); +} + +template void run_benchmark(std::size_t count) { + std::vector random_values = generate_random_values(count); + + const auto [encoded_zero_values, encoded_max_values, encoded_random_values] = + encode_values(count, random_values); + + benchmark( + "Decoding: loop", + count, + encoded_zero_values.get(), + encoded_max_values.get(), + encoded_random_values.get(), + [](const std::uint8_t *ptr) { return varint_decode_general(ptr); } + ); + + benchmark( + "Decoding: unrolled + intrinsic", + count, + encoded_zero_values.get(), + encoded_max_values.get(), + encoded_random_values.get(), + [](const std::uint8_t *ptr) { return varint_decode(ptr); } + ); + + std::vector> random_signed_values = + generate_random_values>(count); + + const auto [encoded_zero_signed_values, encoded_max_signed_values, encoded_random_signed_values] = + encode_signed_values>(count, random_signed_values); + + benchmark( + "Decoding signed: loop", + count, + encoded_zero_signed_values.get(), + encoded_max_signed_values.get(), + encoded_random_signed_values.get(), + [](const std::uint8_t *ptr) { + return signed_varint_decode_general>(ptr); + } + ); + + benchmark( + "Decoding signed: unrolled + intrinsic", + count, + encoded_zero_signed_values.get(), + encoded_max_signed_values.get(), + encoded_random_signed_values.get(), + [](const std::uint8_t *ptr) { return signed_varint_decode>(ptr); } + ); + + const auto [rl_encoded_zero_values, rl_encoded_max_values, rl_encoded_random_values] = + rl_encode_values(count, random_values); + + benchmark_rle( + "Decoding run-length", + count, + rl_encoded_zero_values.get(), + rl_encoded_max_values.get(), + rl_encoded_random_values.get() + ); + + if constexpr (sizeof(Int) == 4) { + const auto [sv_encoded_zero_values, sv_encoded_max_values, sv_encoded_random_values] = + sv_encode_values(count, random_values); + + benchmark_sve( + "Decoding stream", + count, + sv_encoded_zero_values.get(), + sv_encoded_max_values.get(), + sv_encoded_random_values.get() + ); + } +} + +int main(int argc, char *argv[]) { + // Parse CLI arguments + IntType int_type = IntType::INT_32; + std::size_t count = 100000000; + + CLI::App app("Shared-memory variable length codec benchmark"); + app.add_option("-n", count, "The amount of numbers to encode and decode") + ->check(CLI::NonNegativeNumber) + ->default_val(count); + app.add_option("-i,--int", int_type) + ->transform(CLI::CheckedTransformer(get_int_types()).description("")) + ->description(R"(Select a int type. 
The options are: + - int32 + - int64 + )"); + CLI11_PARSE(app, argc, argv); + + // Run Benchmark + LOG << "Running the benchmark..."; + GLOBAL_TIMER.reset(); + + switch (int_type) { + case IntType::INT_32: + run_benchmark(count); + break; + case IntType::INT_64: + run_benchmark(count); + break; + }; + + STOP_TIMER(); + + // Print the result summary + LOG; + cio::print_delimiter("Result Summary"); + LOG << "Encoded and decoded " << count << " integers."; + LOG; + Timer::global().print_human_readable(std::cout); +} diff --git a/apps/io/parhip_parser.cc b/apps/io/parhip_parser.cc new file mode 100644 index 00000000..06ad71e9 --- /dev/null +++ b/apps/io/parhip_parser.cc @@ -0,0 +1,239 @@ +/******************************************************************************* + * Sequential ParHiP parser. + * + * @file: parhip_parser.cc + * @author: Daniel Salwasser + * @date: 15.02.2024 + ******************************************************************************/ +#include "apps/io/parhip_parser.h" + +#include +#include +#include + +#include +#include +#include +#include + +#include "kaminpar-common/logger.h" + +namespace kaminpar::shm::io::parhip { + +constexpr std::uint64_t kParhipHeaderSize = 3 * sizeof(std::uint64_t); + +struct ParhipHeader { + bool has_edge_weights; + bool has_node_weights; + bool has_64_bit_edge_id; + bool has_64_bit_node_id; + bool has_64_bit_node_weight; + bool has_64_bit_edge_weight; + std::uint64_t num_nodes; + std::uint64_t num_edges; +}; + +ParhipHeader parse_header(std::array header) { + const std::uint64_t version = header[0]; + return { + (version & 1) == 0, + (version & 2) == 0, + (version & 4) == 0, + (version & 8) == 0, + (version & 16) == 0, + (version & 32) == 0, + header[1], + header[2] + }; +} + +void validate_ids(ParhipHeader header) { + if (header.has_64_bit_edge_id) { + if (sizeof(EdgeID) != 8) { + LOG_ERROR << "The stored graph uses 64-Bit EdgeIDs but this build uses " + << (sizeof(EdgeID) * 8) << "-Bit EdgeIDs."; + std::exit(1); + } + } else if (sizeof(EdgeID) != 4) { + LOG_ERROR << "The stored graph uses 32-Bit EdgeIDs but this build uses " << (sizeof(EdgeID) * 8) + << "-Bit EdgeIDs."; + std::exit(1); + } + + if (header.has_64_bit_node_id) { + if (sizeof(NodeID) != 8) { + LOG_ERROR << "The stored graph uses 64-Bit NodeIDs but this build uses " + << (sizeof(NodeID) * 8) << "-Bit NodeIDs."; + std::exit(1); + } + } else if (sizeof(NodeID) != 4) { + LOG_ERROR << "The stored graph uses 32-Bit EdgeIDs but this build uses " << (sizeof(NodeID) * 8) + << "-Bit NodeIDs."; + std::exit(1); + } + + if (header.has_64_bit_node_weight) { + if (sizeof(NodeWeight) != 8) { + LOG_ERROR << "The stored graph uses 64-Bit node node weights but this build uses " + << (sizeof(NodeWeight) * 8) << "-Bit node weights."; + std::exit(1); + } + } else if (sizeof(NodeWeight) != 4) { + LOG_ERROR << "The stored graph uses 32-Bit node weights but this build uses " + << (sizeof(NodeWeight) * 8) << "-Bit node weights."; + std::exit(1); + } + + if (header.has_64_bit_edge_weight) { + if (sizeof(EdgeWeight) != 8) { + LOG_ERROR << "The stored graph uses 64-Bit node edge weights but this build uses " + << (sizeof(EdgeWeight) * 8) << "-Bit edge weights."; + std::exit(1); + } + } else if (sizeof(NodeWeight) != 4) { + LOG_ERROR << "The stored graph uses 32-Bit edge weights but this build uses " + << (sizeof(EdgeWeight) * 8) << "-Bit edge weights."; + std::exit(1); + } +} + +CSRGraph read_graph( + std::ifstream &in, + const std::uint64_t n, + const std::uint64_t m, + const bool weighted_nodes, + 
const bool weighted_edges,
+    const bool sorted
+) {
+  StaticArray<EdgeID> nodes(n + 1);
+  in.read(reinterpret_cast<char *>(nodes.data()), (n + 1) * sizeof(EdgeID));
+
+  const EdgeID nodes_offset = kParhipHeaderSize + (n + 1) * sizeof(EdgeID);
+  tbb::parallel_for(tbb::blocked_range<NodeID>(0, n + 1), [&](const auto &r) {
+    for (NodeID u = r.begin(); u != r.end(); ++u) {
+      nodes[u] = (nodes[u] - nodes_offset) / sizeof(NodeID);
+    }
+  });
+
+  StaticArray<NodeID> edges(m);
+  in.read(reinterpret_cast<char *>(edges.data()), m * sizeof(NodeID));
+
+  StaticArray<NodeWeight> node_weights;
+  if (weighted_nodes) {
+    node_weights.resize(n);
+    in.read(reinterpret_cast<char *>(node_weights.data()), n * sizeof(NodeWeight));
+  }
+
+  StaticArray<EdgeWeight> edge_weights;
+  if (weighted_edges) {
+    edge_weights.resize(m);
+    in.read(reinterpret_cast<char *>(edge_weights.data()), m * sizeof(EdgeWeight));
+  }
+
+  CSRGraph graph = CSRGraph(
+      std::move(nodes), std::move(edges), std::move(node_weights), std::move(edge_weights), sorted
+  );
+
+  return graph;
+}
+
+CSRGraph csr_read(const std::string &filename, const bool sorted) {
+  std::ifstream in(filename, std::ios::binary);
+  if (!in.is_open()) {
+    LOG_ERROR << "Cannot read graph stored at " << filename << ".";
+    std::exit(1);
+  }
+
+  std::array<std::uint64_t, 3> raw_header;
+  in.read(reinterpret_cast<char *>(raw_header.data()), kParhipHeaderSize);
+
+  ParhipHeader header = parse_header(raw_header);
+  validate_ids(header);
+
+  return read_graph(
+      in,
+      header.num_nodes,
+      header.num_edges,
+      header.has_node_weights,
+      header.has_edge_weights,
+      sorted
+  );
+}
+
+CompressedGraph compressed_read(const std::string &filename, const bool sorted) {
+  const int file = open(filename.c_str(), O_RDONLY);
+  if (file < 0) {
+    LOG_ERROR << "Cannot read graph stored at " << filename << ".";
+    std::exit(1);
+  }
+
+  struct stat file_info {};
+  if (fstat(file, &file_info) < 0) {
+    LOG_ERROR << "Cannot read graph stored at " << filename << ".";
+    close(file);
+    std::exit(1);
+  }
+
+  const std::size_t length = static_cast<std::size_t>(file_info.st_size);
+
+  std::uint8_t *data =
+      static_cast<std::uint8_t *>(mmap(nullptr, length, PROT_READ, MAP_PRIVATE, file, 0));
+  if (data == MAP_FAILED) {
+    LOG_ERROR << "Cannot read graph stored at " << filename << ".";
+    close(file);
+    std::exit(1);
+  }
+
+  std::array<std::uint64_t, 3> raw_header;
+  std::memcpy(raw_header.data(), data, kParhipHeaderSize);
+  data += kParhipHeaderSize;
+
+  const ParhipHeader header = parse_header(raw_header);
+  validate_ids(header);
+
+  CompressedGraphBuilder builder;
+  builder.init(
+      header.num_nodes, header.num_edges, header.has_node_weights, header.has_edge_weights, sorted
+  );
+
+  const EdgeID *nodes = reinterpret_cast<const EdgeID *>(data);
+  data += (header.num_nodes + 1) * sizeof(EdgeID);
+
+  const NodeID *edges = reinterpret_cast<const NodeID *>(data);
+  data += header.num_edges * sizeof(NodeID);
+
+  const NodeWeight *node_weights = reinterpret_cast<const NodeWeight *>(data);
+  data += header.num_nodes * sizeof(NodeWeight);
+
+  const EdgeWeight *edge_weights = reinterpret_cast<const EdgeWeight *>(data);
+
+  const EdgeID nodes_offset = kParhipHeaderSize + (header.num_nodes + 1) * sizeof(EdgeID);
+  std::vector<std::pair<NodeID, EdgeWeight>> neighbourhood;
+  for (NodeID u = 0; u < header.num_nodes; ++u) {
+    const EdgeID offset = (nodes[u] - nodes_offset) / sizeof(NodeID);
+    const EdgeID next_offset = (nodes[u + 1] - nodes_offset) / sizeof(NodeID);
+
+    const NodeID degree = static_cast<NodeID>(next_offset - offset);
+    for (NodeID i = 0; i < degree; ++i) {
+      const EdgeID e = offset + i;
+
+      const NodeID adjacent_node = edges[e];
+      const EdgeWeight edge_weight = header.has_edge_weights ?
edge_weights[e] : 1; + + neighbourhood.push_back(std::make_pair(adjacent_node, edge_weight)); + } + + builder.add_node(u, neighbourhood); + if (header.has_node_weights) { + builder.set_node_weight(u, node_weights[u]); + } + + neighbourhood.clear(); + } + + munmap(data, length); + close(file); + return builder.build(); +} + +} // namespace kaminpar::shm::io::parhip diff --git a/apps/io/parhip_parser.h b/apps/io/parhip_parser.h new file mode 100644 index 00000000..91a46eff --- /dev/null +++ b/apps/io/parhip_parser.h @@ -0,0 +1,21 @@ +/******************************************************************************* + * Sequential ParHiP parser. + * + * @file: parhip_parser.h + * @author: Daniel Salwasser + * @date: 15.02.2024 + ******************************************************************************/ +#pragma once + +#include + +#include "kaminpar-shm/datastructures/compressed_graph.h" +#include "kaminpar-shm/datastructures/csr_graph.h" + +namespace kaminpar::shm::io::parhip { + +CSRGraph csr_read(const std::string &filename, const bool sorted); + +CompressedGraph compressed_read(const std::string &filename, const bool sorted); + +} // namespace kaminpar::shm::io::parhip diff --git a/apps/io/shm_compressed_graph_binary.cc b/apps/io/shm_compressed_graph_binary.cc new file mode 100644 index 00000000..b1c3ad23 --- /dev/null +++ b/apps/io/shm_compressed_graph_binary.cc @@ -0,0 +1,268 @@ +/******************************************************************************* + * IO utilities for the compressed graph binary. + * + * @file: shm_compressed_graph_binary.cc + * @author: Daniel Salwasser + * @date: 12.12.2023 + ******************************************************************************/ +#include "apps/io/shm_compressed_graph_binary.h" + +#include +#include + +#include "kaminpar-common/logger.h" + +namespace kaminpar::shm::io::compressed_binary { + +template static void write_int(std::ofstream &out, const T id) { + out.write(reinterpret_cast(&id), sizeof(T)); +} + +template +static void write_compact_static_array(std::ofstream &out, const CompactStaticArray &array) { + write_int(out, array.byte_width()); + write_int(out, array.allocated_size()); + out.write(reinterpret_cast(array.data()), array.allocated_size()); +} + +template +static void write_static_array(std::ofstream &out, const StaticArray &static_array) { + out.write(reinterpret_cast(static_array.data()), static_array.size() * sizeof(T)); +} + +void write(const std::string &filename, const CompressedGraph &graph) { + std::ofstream out(filename, std::ios::binary); + + write_int(out, kMagicNumber); + + write_int(out, static_cast(sizeof(CompressedGraph::NodeID))); + write_int(out, static_cast(sizeof(CompressedGraph::EdgeID))); + write_int(out, static_cast(sizeof(CompressedGraph::NodeWeight))); + write_int(out, static_cast(sizeof(CompressedGraph::EdgeWeight))); + + write_int(out, static_cast(CompressedGraph::kHighDegreeEncoding)); + write_int(out, CompressedGraph::kHighDegreeThreshold); + write_int(out, CompressedGraph::kHighDegreePartLength); + write_int(out, static_cast(CompressedGraph::kIntervalEncoding)); + write_int(out, CompressedGraph::kIntervalLengthTreshold); + write_int(out, static_cast(CompressedGraph::kRunLengthEncoding)); + write_int(out, static_cast(CompressedGraph::kStreamEncoding)); + write_int(out, static_cast(CompressedGraph::kIsolatedNodesSeparation)); + + write_int(out, graph.n()); + write_int(out, graph.m()); + write_int(out, graph.max_degree()); + write_int(out, static_cast(graph.sorted())); + write_int(out, 
static_cast(graph.node_weighted())); + write_int(out, static_cast(graph.edge_weighted())); + + write_int(out, graph.high_degree_count()); + write_int(out, graph.part_count()); + write_int(out, graph.interval_count()); + + write_compact_static_array(out, graph.raw_nodes()); + + write_int(out, graph.raw_compressed_edges().size()); + write_static_array(out, graph.raw_compressed_edges()); + + if (graph.node_weighted()) { + write_static_array(out, graph.raw_node_weights()); + } + + if (graph.edge_weighted()) { + write_static_array(out, graph.raw_edge_weights()); + } +} + +template static T read_int(std::ifstream &in) { + T t; + in.read(reinterpret_cast(&t), sizeof(T)); + return t; +} + +template static CompactStaticArray read_compact_static_array(std::ifstream &in) { + std::uint8_t byte_width = read_int(in); + std::size_t allocated_size = read_int(in); + + auto data = std::make_unique(allocated_size); + in.read(reinterpret_cast(data.get()), allocated_size); + return CompactStaticArray(byte_width, allocated_size, std::move(data)); +} + +template +static StaticArray read_static_array(std::ifstream &in, const std::size_t size) { + T *ptr = static_cast(std::malloc(sizeof(T) * size)); + in.read(reinterpret_cast(ptr), sizeof(T) * size); + return StaticArray(ptr, size); +} + +CompressedGraph read(const std::string &filename) { + using NodeID = CompressedGraph::NodeID; + using EdgeID = CompressedGraph::EdgeID; + using NodeWeight = CompressedGraph::NodeWeight; + using EdgeWeight = CompressedGraph::EdgeWeight; + + std::ifstream in(filename, std::ios::binary); + + if (kMagicNumber != read_int(in)) { + LOG_ERROR << "The magic number of the file is not correct!"; + std::exit(1); + } + + std::uint8_t stored_node_id_size = read_int(in); + if (stored_node_id_size != sizeof(NodeID)) { + LOG_ERROR << "The stored compressed graph uses " << (stored_node_id_size * 8) + << "-Bit NodeIDs but this build uses " << (sizeof(NodeID) * 8) << "-Bit NodeIDs."; + std::exit(1); + } + + std::uint8_t stored_edge_id_size = read_int(in); + if (stored_edge_id_size != sizeof(EdgeID)) { + LOG_ERROR << "The stored compressed graph uses " << (stored_edge_id_size * 8) + << "-Bit EdgeIDs but this build uses " << (sizeof(EdgeID) * 8) << "-Bit EdgeIDs."; + std::exit(1); + } + + std::uint8_t stored_node_weight_size = read_int(in); + if (stored_node_weight_size != sizeof(NodeWeight)) { + LOG_ERROR << "The stored compressed graph uses " << (stored_node_weight_size * 8) + << "-Bit NodeWeights but this build uses " << (sizeof(NodeWeight) * 8) + << "-Bit NodeWeights."; + std::exit(1); + } + + std::uint8_t stored_edge_weight_size = read_int(in); + if (stored_edge_weight_size != sizeof(EdgeWeight)) { + LOG_ERROR << "The stored compressed graph uses " << (stored_edge_weight_size * 8) + << "-Bit EdgeWeights but this build uses " << (sizeof(EdgeWeight) * 8) + << "-Bit EdgeWeights."; + std::exit(1); + } + + bool high_degree_encoding = static_cast(read_int(in)); + if (high_degree_encoding != CompressedGraph::kHighDegreeEncoding) { + if (high_degree_encoding) { + LOG_ERROR << "The stored compressed graph uses high degree encoding but this build does not."; + } else { + LOG_ERROR + << "The stored compressed graph does not use high degree encoding but this build does."; + } + std::exit(1); + } + + NodeID high_degree_threshold = read_int(in); + if (high_degree_threshold != CompressedGraph::kHighDegreeThreshold) { + LOG_ERROR << "The stored compressed graph uses " << high_degree_threshold + << " as the high degree threshold but this build uses " + << 
(CompressedGraph::kHighDegreeThreshold) << " as the high degree threshold."; + std::exit(1); + } + + NodeID high_degree_part_length = read_int(in); + if (high_degree_part_length != CompressedGraph::kHighDegreePartLength) { + LOG_ERROR << "The stored compressed graph uses " << high_degree_part_length + << " as the high degree part length but this build uses " + << (CompressedGraph::kHighDegreePartLength) << " as the high degree part length."; + std::exit(1); + } + + bool interval_encoding = static_cast(read_int(in)); + if (interval_encoding != CompressedGraph::kIntervalEncoding) { + if (interval_encoding) { + LOG_ERROR << "The stored compressed graph uses interval encoding but this build does not."; + } else { + LOG_ERROR + << "The stored compressed graph does not use interval encoding but this build does."; + } + std::exit(1); + } + + NodeID interval_length_threshold = read_int(in); + if (interval_length_threshold != CompressedGraph::kIntervalLengthTreshold) { + LOG_ERROR << "The stored compressed graph uses " << interval_length_threshold + << " as the interval length threshold but this build uses " + << (CompressedGraph::kIntervalLengthTreshold) << " as the interval length threshold."; + std::exit(1); + } + + bool run_length_encoding = static_cast(read_int(in)); + if (run_length_encoding != CompressedGraph::kRunLengthEncoding) { + if (run_length_encoding) { + LOG_ERROR << "The stored compressed graph uses run-length encoding but this build does not."; + } else { + LOG_ERROR + << "The stored compressed graph does not use run-length encoding but this build does."; + } + std::exit(1); + } + + bool stream_encoding = static_cast(read_int(in)); + if (stream_encoding != CompressedGraph::kStreamEncoding) { + if (stream_encoding) { + LOG_ERROR << "The stored compressed graph uses stream encoding but this build does not."; + } else { + LOG_ERROR << "The stored compressed graph does not use stream encoding but this build does."; + } + std::exit(1); + } + + bool isolated_nodes_separation = static_cast(read_int(in)); + if (isolated_nodes_separation != CompressedGraph::kIsolatedNodesSeparation) { + if (isolated_nodes_separation) { + LOG_ERROR + << "The stored compressed graph uses isolated nodes separation but this build does not."; + } else { + LOG_ERROR << "The stored compressed graph does not use isolated nodes separation but this " + "build does."; + } + std::exit(1); + } + + NodeID n = read_int(in); + EdgeID m = read_int(in); + NodeID max_degree = read_int(in); + bool sorted = static_cast(read_int(in)); + bool is_node_weighted = static_cast(read_int(in)); + bool is_edge_weighted = static_cast(read_int(in)); + + std::size_t high_degree_count = read_int(in); + std::size_t part_count = read_int(in); + std::size_t interval_count = read_int(in); + + CompactStaticArray nodes = read_compact_static_array(in); + + std::size_t compressed_edges_size = read_int(in); + StaticArray compressed_edges = + read_static_array(in, compressed_edges_size); + StaticArray node_weights = + + is_node_weighted ? read_static_array(in, n) : StaticArray(); + + StaticArray edge_weights = + is_edge_weighted ? 
read_static_array(in, m) : StaticArray(); + + return CompressedGraph( + std::move(nodes), + std::move(compressed_edges), + std::move(node_weights), + std::move(edge_weights), + m, + max_degree, + sorted, + high_degree_count, + part_count, + interval_count + ); +} + +bool is_compressed(const std::string &filename) { + const auto size = std::filesystem::file_size(filename); + if (size < sizeof(kMagicNumber)) { + return false; + } + + std::ifstream in(filename, std::ios::binary); + return kMagicNumber == read_int(in); +} + +} // namespace kaminpar::shm::io::compressed_binary diff --git a/apps/io/shm_compressed_graph_binary.h b/apps/io/shm_compressed_graph_binary.h new file mode 100644 index 00000000..0362e3d2 --- /dev/null +++ b/apps/io/shm_compressed_graph_binary.h @@ -0,0 +1,44 @@ +/******************************************************************************* + * IO utilities for the compressed graph binary. + * + * @file: shm_compressed_graph_binary.h + * @author: Daniel Salwasser + * @date: 12.12.2023 + ******************************************************************************/ +#pragma once + +#include + +#include "kaminpar-shm/datastructures/compressed_graph.h" + +namespace kaminpar::shm::io::compressed_binary { + +//! Magic number to identify a compressed graph binary file. +constexpr std::uint64_t kMagicNumber = 0x434F4D5052455353; + +/*! + * Writes a graph to a file as a compressed graph binary. + * + * @param filename The name of the file to write to. + * @param graph The compressed graph to write. + */ +void write(const std::string &filename, const CompressedGraph &graph); + +/*! + * Reads the graph from a compressed graph binary file. If the paramters of the compressed graph + * stored in the file do not match with this build, exit is called. + * + * @param filename The name of the file to read from. + * @return The read compressed graph. + */ +CompressedGraph read(const std::string &filename); + +/*! + * Checks whether a graph is stored in compressed format. + * + * @param filename The name of the file to check. + * @return Whether the graph is stored in compressed format. 
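+ *
+ * A small usage sketch (assuming `graph` is an already built CompressedGraph and
+ * "graph.bin" is a placeholder file name), called from code inside kaminpar::shm:
+ *
+ *   io::compressed_binary::write("graph.bin", graph);
+ *   if (io::compressed_binary::is_compressed("graph.bin")) {
+ *     CompressedGraph restored = io::compressed_binary::read("graph.bin");
+ *   }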
+ */ +bool is_compressed(const std::string &filename); + +} // namespace kaminpar::shm::io::compressed_binary diff --git a/apps/io/shm_io.cc b/apps/io/shm_io.cc index 95b392a1..36b9f251 100644 --- a/apps/io/shm_io.cc +++ b/apps/io/shm_io.cc @@ -9,30 +9,141 @@ #include +#include "kaminpar-shm/datastructures/graph.h" #include "kaminpar-shm/kaminpar.h" #include "kaminpar-common/datastructures/static_array.h" #include "kaminpar-common/logger.h" #include "apps/io/metis_parser.h" +#include "apps/io/parhip_parser.h" +#include "apps/io/shm_compressed_graph_binary.h" +#include "apps/io/shm_input_validator.h" namespace kaminpar::shm::io { // // Public Metis functions // namespace metis { + +template void check_format(kaminpar::io::metis::Format format) { + if constexpr (checked) { + if (format.number_of_nodes >= static_cast(std::numeric_limits::max())) { + LOG_ERROR << "number of nodes is too large for the node ID type"; + std::exit(1); + } + if (format.number_of_edges >= static_cast(std::numeric_limits::max())) { + LOG_ERROR << "number of edges is too large for the edge ID type"; + std::exit(1); + } + if (format.number_of_edges > (format.number_of_nodes * (format.number_of_nodes - 1) / 2)) { + LOG_ERROR << "specified number of edges is impossibly large"; + std::exit(1); + } + } else { + KASSERT( + format.number_of_nodes <= static_cast(std::numeric_limits::max()), + "number of nodes is too large for the node ID type" + ); + KASSERT( + format.number_of_edges <= static_cast(std::numeric_limits::max()), + "number of edges is too large for the edge ID type" + ); + KASSERT( + format.number_of_edges <= (format.number_of_nodes * (format.number_of_nodes - 1)) / 2, + "specified number of edges is impossibly large" + ); + } +} + +template void check_node_weight(const std::uint64_t weight) { + if constexpr (checked) { + if (weight > static_cast(std::numeric_limits::max())) { + LOG_ERROR << "node weight is too large for the node weight type"; + std::exit(1); + } + if (weight <= 0) { + LOG_ERROR << "zero node weights are not supported"; + std::exit(1); + } + } else { + KASSERT( + weight <= static_cast(std::numeric_limits::max()), + "node weight is too large for the node weight type" + ); + KASSERT(weight > 0u, "zero node weights are not supported"); + } +} + template -void read( - const std::string &filename, - StaticArray &nodes, - StaticArray &edges, - StaticArray &node_weights, - StaticArray &edge_weights +void check_edge( + const std::uint64_t node_count, + const std::uint64_t u, + const std::uint64_t weight, + const std::uint64_t v ) { + if constexpr (checked) { + if (weight > static_cast(std::numeric_limits::max())) { + LOG_ERROR << "edge weight is too large for the edge weight type"; + std::exit(1); + } + if (weight <= 0) { + LOG_ERROR << "zero edge weights are not supported"; + std::exit(1); + } + if (v + 1 >= node_count) { + LOG_ERROR << "neighbor " << v + 1 << " of nodes " << u + 1 << " is out of bounds"; + std::exit(1); + } + if (v + 1 == u) { + LOG_ERROR << "detected self-loop on node " << v + 1 << ", which is not allowed"; + std::exit(1); + } + } else { + KASSERT( + weight <= static_cast(std::numeric_limits::max()), + "edge weight is too large for the edge weight type" + ); + KASSERT(weight > 0u, "zero edge weights are not supported"); + KASSERT(v + 1 < node_count, "neighbor out of bounds"); + KASSERT(u != v + 1, "detected illegal self-loop"); + } +} + +template +void check_total_weight(std::int64_t total_node_weight, std::int64_t total_edge_weight) { + if constexpr (checked) { + if 
(total_node_weight > static_cast(std::numeric_limits::max())) { + LOG_ERROR << "total node weight does not fit into the node weight type"; + std::exit(1); + } + if (total_edge_weight > static_cast(std::numeric_limits::max())) { + LOG_ERROR << "total edge weight does not fit into the edge weight type"; + std::exit(1); + } + } else { + KASSERT( + total_node_weight <= static_cast(std::numeric_limits::max()), + "total node weight does not fit into the node weight type" + ); + KASSERT( + total_edge_weight <= static_cast(std::numeric_limits::max()), + "total edge weight does not fit into the edge weight type" + ); + } +} + +template CSRGraph csr_read(const std::string &filename, const bool sorted) { using namespace kaminpar::io::metis; + RECORD("nodes") StaticArray nodes; + RECORD("edges") StaticArray edges; + RECORD("node_weights") StaticArray node_weights; + RECORD("edge_weights") StaticArray edge_weights; + bool store_node_weights = false; bool store_edge_weights = false; + std::int64_t total_node_weight = 0; std::int64_t total_edge_weight = 0; @@ -42,134 +153,50 @@ void read( parse( filename, [&](const auto &format) { - if constexpr (checked) { - if (format.number_of_nodes >= - static_cast(std::numeric_limits::max())) { - LOG_ERROR << "number of nodes is too large for the node ID type"; - std::exit(1); - } - if (format.number_of_edges >= - static_cast(std::numeric_limits::max())) { - LOG_ERROR << "number of edges is too large for the edge ID type"; - std::exit(1); - } - if (format.number_of_edges > - (format.number_of_nodes * (format.number_of_nodes - 1) / 2)) { - LOG_ERROR << "specified number of edges is impossibly large"; - std::exit(1); - } - } else { - KASSERT( - format.number_of_nodes <= - static_cast(std::numeric_limits::max()), - "number of nodes is too large for the node ID type" - ); - KASSERT( - format.number_of_edges <= - static_cast(std::numeric_limits::max()), - "number of edges is too large for the edge ID type" - ); - KASSERT( - format.number_of_edges <= (format.number_of_nodes * (format.number_of_nodes - 1)) / 2, - "specified number of edges is impossibly large" - ); - } + check_format(format); store_node_weights = format.has_node_weights; store_edge_weights = format.has_edge_weights; + nodes.resize(format.number_of_nodes + 1); edges.resize(format.number_of_edges * 2); + if (store_node_weights) { node_weights.resize(format.number_of_nodes); } + if (store_edge_weights) { edge_weights.resize(format.number_of_edges * 2); } }, [&](const std::uint64_t weight) { - if constexpr (checked) { - if (weight > static_cast(std::numeric_limits::max())) { - LOG_ERROR << "node weight is too large for the node weight type"; - std::exit(1); - } - if (weight <= 0) { - LOG_ERROR << "zero node weights are not supported"; - std::exit(1); - } - } else { - KASSERT( - weight <= static_cast(std::numeric_limits::max()), - "node weight is too large for the node weight type" - ); - KASSERT(weight > 0u, "zero node weights are not supported"); - } + check_node_weight(weight); + total_node_weight += weight; if (store_node_weights) { node_weights[u] = static_cast(weight); } + nodes[u] = e; - total_node_weight += weight; - ++u; + u += 1; }, [&](const std::uint64_t weight, const std::uint64_t v) { - if constexpr (checked) { - if (weight > static_cast(std::numeric_limits::max())) { - LOG_ERROR << "edge weight is too large for the edge weight type"; - std::exit(1); - } - if (weight <= 0) { - LOG_ERROR << "zero edge weights are not supported"; - std::exit(1); - } - if (v + 1 >= nodes.size()) { - LOG_ERROR << 
"neighbor " << v + 1 << " of nodes " << u + 1 << " is out of bounds"; - std::exit(1); - } - if (v + 1 == u) { - LOG_ERROR << "detected self-loop on node " << v + 1 << ", which is not allowed"; - std::exit(1); - } - } else { - KASSERT( - weight <= static_cast(std::numeric_limits::max()), - "edge weight is too large for the edge weight type" - ); - KASSERT(weight > 0u, "zero edge weights are not supported"); - KASSERT(v + 1 < nodes.size(), "neighbor out of bounds"); - KASSERT(u != v + 1, "detected illegal self-loop"); - } + check_edge(nodes.size(), u, weight, v); + total_edge_weight += weight; if (store_edge_weights) { edge_weights[e] = static_cast(weight); } + edges[e] = static_cast(v); - total_edge_weight += weight; - ++e; + e += 1; } ); nodes[u] = e; - if constexpr (checked) { - if (total_node_weight > static_cast(std::numeric_limits::max())) { - LOG_ERROR << "total node weight does not fit into the node weight type"; - std::exit(1); - } - if (total_edge_weight > static_cast(std::numeric_limits::max())) { - LOG_ERROR << "total edge weight does not fit into the edge weight type"; - std::exit(1); - } - } else { - KASSERT( - total_node_weight <= static_cast(std::numeric_limits::max()), - "total node weight does not fit into the node weight type" - ); - KASSERT( - total_edge_weight <= static_cast(std::numeric_limits::max()), - "total edge weight does not fit into the edge weight type" - ); - } + check_total_weight(total_node_weight, total_edge_weight); - // only keep weights if the graph is really weighted + // Only keep weights if the graph is really weighted. const bool unit_node_weights = static_cast(total_node_weight + 1) == nodes.size(); if (unit_node_weights) { node_weights.free(); @@ -179,25 +206,208 @@ void read( if (unit_edge_weights) { edge_weights.free(); } + + return CSRGraph( + std::move(nodes), std::move(edges), std::move(node_weights), std::move(edge_weights), sorted + ); } -template void read( - const std::string &filename, - StaticArray &nodes, - StaticArray &edges, - StaticArray &node_weights, - StaticArray &edge_weights -); +template CSRGraph csr_read(const std::string &filename, const bool sorted); +template CSRGraph csr_read(const std::string &filename, const bool sorted); + +template +std::optional +compress_read(const std::string &filename, const bool sorted, const bool may_dismiss) { + using namespace kaminpar::io::metis; + + std::uint64_t number_of_nodes; + bool store_node_weights; + bool store_edge_weights; + + std::size_t uncompressed_edge_array_size; + bool dismissed = false; + + NodeID node = 0; + EdgeID edge = 0; + + CompressedGraphBuilder builder; + RECORD("neighbourhood") std::vector> neighbourhood; + RECORD_LOCAL_DATA_STRUCT("vector>", 0, neighbourhood_stats); + + parse( + filename, + [&](const auto &format) { + check_format(format); + + const std::size_t max_size = CompressedGraphBuilder::compressed_edge_array_max_size( + format.number_of_nodes, format.number_of_edges + ); + const std::size_t node_array_diff = + (sizeof(EdgeID) - math::byte_width(max_size)) * (format.number_of_nodes + 1); + + number_of_nodes = format.number_of_nodes + 1; + uncompressed_edge_array_size = + format.number_of_edges * sizeof(NodeID) * 2 + node_array_diff; + store_node_weights = format.has_node_weights; + store_edge_weights = format.has_edge_weights; + + builder.init( + format.number_of_nodes, + format.number_of_edges, + store_node_weights, + store_edge_weights, + sorted + ); + }, + [&](const std::uint64_t weight) { + check_node_weight(weight); + + if (node > 0) { + 
builder.add_node(node - 1, neighbourhood); + + if (may_dismiss && builder.edge_array_size() > uncompressed_edge_array_size) { + dismissed = true; + return false; + } + + neighbourhood.clear(); + } + + if (store_node_weights) { + builder.set_node_weight(node, static_cast(weight)); + } + + node += 1; + return true; + }, + [&](const std::uint64_t weight, const std::uint64_t v) { + check_edge(number_of_nodes, node, weight, v); + + neighbourhood.push_back(std::pair(static_cast(v), static_cast(weight))); + edge += 1; + } + ); + + if (dismissed) { + return std::nullopt; + } + + builder.add_node(node - 1, neighbourhood); + + check_total_weight(builder.total_node_weight(), builder.total_edge_weight()); + IF_HEAP_PROFILING(neighbourhood_stats->size = neighbourhood.capacity() * sizeof(NodeID)); + + return builder.build(); +} + +template std::optional +compress_read(const std::string &filename, const bool sorted, const bool may_dismiss); +template std::optional +compress_read(const std::string &filename, const bool sorted, const bool may_dismiss); + +void write(const std::string &filename, const Graph &graph) { + std::ofstream out(filename); + + out << graph.n() << ' ' << (graph.m() / 2); + if (graph.node_weighted() || graph.edge_weighted()) { + out << ' '; + + if (graph.node_weighted()) { + out << '1'; + } + + out << (graph.edge_weighted() ? '1' : '0'); + } + out << '\n'; + + for (const NodeID node : graph.nodes()) { + if (graph.node_weighted()) { + out << graph.node_weight(node) << ' '; + } + + graph.neighbors(node, [&](const EdgeID incident_edge, const NodeID adjacent_node) { + out << (adjacent_node + 1) << ' '; + + if (graph.edge_weighted()) { + out << graph.edge_weight(incident_edge) << ' '; + } + }); + + out << '\n'; + } +} -template void read( - const std::string &filename, - StaticArray &nodes, - StaticArray &edges, - StaticArray &node_weights, - StaticArray &edge_weights -); } // namespace metis +std::unordered_map get_graph_file_formats() { + return { + {"metis", GraphFileFormat::METIS}, + {"parhip", GraphFileFormat::PARHIP}, + }; +} + +Graph read( + const std::string &filename, + const GraphFileFormat file_format, + const bool compress, + const bool may_dismiss, + const bool sorted, + const bool validate +) { + if (compressed_binary::is_compressed(filename)) { + if (!compress) { + LOG_ERROR + << "The input graph is stored in a compressed format but graph compression is disabled!"; + std::exit(1); + } + + return Graph(std::make_unique(compressed_binary::read(filename))); + } + + if (compress) { + std::optional compresed_graph = [&] { + if (validate) { + return metis::compress_read(filename, sorted, may_dismiss); + } else { + switch (file_format) { + case GraphFileFormat::METIS: + return metis::compress_read(filename, sorted, may_dismiss); + case GraphFileFormat::PARHIP: + return std::optional(parhip::compressed_read(filename, sorted)); + default: + throw std::runtime_error("unexpected graph file format"); + } + } + }(); + + if (compresed_graph) { + return Graph(std::make_unique(std::move(*compresed_graph))); + } + } + + if (validate) { + CSRGraph csr_graph = metis::csr_read(filename, sorted); + + shm::validate_undirected_graph( + csr_graph.raw_nodes(), + csr_graph.raw_edges(), + csr_graph.raw_node_weights(), + csr_graph.raw_edge_weights() + ); + + return Graph(std::make_unique(std::move(csr_graph))); + } else { + switch (file_format) { + case GraphFileFormat::METIS: + return Graph(std::make_unique(metis::csr_read(filename, sorted))); + case GraphFileFormat::PARHIP: + return 
Graph(std::make_unique(parhip::csr_read(filename, sorted))); + default: + throw std::runtime_error("unexpected graph file format"); + } + } +} + // // Partition // diff --git a/apps/io/shm_io.h b/apps/io/shm_io.h index cc5e4bbb..7136ce87 100644 --- a/apps/io/shm_io.h +++ b/apps/io/shm_io.h @@ -7,25 +7,91 @@ ******************************************************************************/ #pragma once +#include #include #include +#include "kaminpar-shm/datastructures/compressed_graph.h" +#include "kaminpar-shm/datastructures/csr_graph.h" +#include "kaminpar-shm/datastructures/graph.h" #include "kaminpar-shm/kaminpar.h" #include "kaminpar-common/datastructures/static_array.h" namespace kaminpar::shm::io { namespace metis { + +/** + * Reads a graph that is stored in a file in METIS format. + * + * @param filename The name of the file to read. + * @param sorted Whether the nodes of the graph to read are stored in deg-buckets order. + * @tparam checked Whether to validate the read graph. + * @return The graph in compressed sparse row format stored in the file. + */ +template CSRGraph csr_read(const std::string &filename, const bool sorted = false); + +/*! + * Reads and compresses a graph that is stored in a file in METIS format. + * + * @param filename The name of the file to read. + * @param sorted Whether the nodes of the graph to read are stored in deg-buckets order. + * @param may_dismiss Whether the reading process is aborted when the compressed graph uses more + * memory than the uncompressed graph. + * @tparam checked Whether to validate the read graph. + * @return The graph in compressed form stored in the file. + */ template -void read( - const std::string &filename, - StaticArray &nodes, - StaticArray &edges, - StaticArray &node_weights, - StaticArray &edge_weights +std::optional compress_read( + const std::string &filename, const bool sorted = false, const bool may_dismiss = false ); + +/*! + * Writes a graph to a file in METIS format. + * + * @param filename The name of the file for saving the graph. + * @param graph The graph to save. + */ +void write(const std::string &filename, const Graph &graph); + } // namespace metis +/*! + * All graph file formats that can be parsed. + */ +enum class GraphFileFormat { + METIS, + PARHIP +}; + +/*! + * Returns a table which maps identifiers to their corresponding graph file format. + * + * @return A table which maps identifiers to their corresponding graph file format. + */ +std::unordered_map get_graph_file_formats(); + +/*! + * Reads a graph that is either stored in METIS or compressed format. + * + * @param filename The name of the file to read. + * @param file_format The format of the file used to store the graph. + * @param compress Whether to compress the graph. + * @param may_dismiss Whether the compressed graph is only returned when it uses less memory than + * the uncompressed graph. + * @param sorted Whether the nodes of the graph to read are stored in deg-buckets order. + * @param validate Whether to validate the graph. + * @return The graph to read. 
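+ *
+ * A small call sketch (the file name is a placeholder; the boolean arguments are, in
+ * order, compress, may_dismiss, sorted and validate):
+ *
+ *   Graph graph = read("example.graph", GraphFileFormat::METIS, true, true, false, false);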
+ */ +Graph read( + const std::string &filename, + const GraphFileFormat file_format, + const bool compress, + const bool may_dismiss, + const bool sorted, + const bool validate +); + namespace partition { std::vector read(const std::string &filename); void write(const std::string &filename, const std::vector &partition); diff --git a/apps/tools/CMakeLists.txt b/apps/tools/CMakeLists.txt new file mode 100644 index 00000000..08f0c5ab --- /dev/null +++ b/apps/tools/CMakeLists.txt @@ -0,0 +1,10 @@ +function(add_shm_tool target) + add_executable(${target} ${ARGN}) + target_link_libraries(${target} PRIVATE KaMinPar::KaMinPar KaMinPar::KaMinParIO) + message(STATUS "Enabled tool: ${target}") +endfunction() + +# Shared-memory tools +add_shm_tool(shm_graph_compression_tool shm_graph_compression_tool.cc) +add_shm_tool(shm_graph_properties_tool shm_graph_properties_tool.cc) +add_shm_tool(shm_graph_rearrangement_tool shm_graph_rearrangement_tool.cc) diff --git a/apps/tools/shm_graph_compression_tool.cc b/apps/tools/shm_graph_compression_tool.cc new file mode 100644 index 00000000..7db47575 --- /dev/null +++ b/apps/tools/shm_graph_compression_tool.cc @@ -0,0 +1,44 @@ +/******************************************************************************* + * Graph compression tool for the shared-memory algorithm. + * + * @file: shm_graph_compression_tool.cc + * @author: Daniel Salwasser + * @date: 14.12.2023 + ******************************************************************************/ +// clang-format off +#include +// clang-format on + +#include + +#include "kaminpar-common/logger.h" + +#include "apps/io/shm_compressed_graph_binary.h" +#include "apps/io/shm_io.h" + +using namespace kaminpar; +using namespace kaminpar::shm; + +int main(int argc, char *argv[]) { + // Parse CLI arguments + std::string graph_filename; + std::string compressed_graph_filename; + int num_threads = 1; + + CLI::App app("Shared-memory graph compression tool"); + app.add_option("-G,--graph", graph_filename, "Input graph in METIS format")->required(); + app.add_option("--out", compressed_graph_filename, "Ouput file for saving the compressed graph") + ->required(); + app.add_option("-t,--threads", num_threads, "Number of threads"); + CLI11_PARSE(app, argc, argv); + + tbb::global_control gc(tbb::global_control::max_allowed_parallelism, num_threads); + + LOG << "Reading input graph..."; + CompressedGraph graph = *io::metis::compress_read(graph_filename); + + LOG << "Writing compressed graph..."; + io::compressed_binary::write(compressed_graph_filename, graph); + + return 0; +} diff --git a/apps/tools/shm_graph_properties_tool.cc b/apps/tools/shm_graph_properties_tool.cc new file mode 100644 index 00000000..35c2e82a --- /dev/null +++ b/apps/tools/shm_graph_properties_tool.cc @@ -0,0 +1,110 @@ +/******************************************************************************* + * Graph properties tool for the shared-memory algorithm. 
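+ * Reports basic graph statistics (node and edge counts, maximum and average degree,
+ * number of isolated nodes) together with the graph compression configuration.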
+ * + * @file: shm_graph_properties_tool.cc + * @author: Daniel Salwasser + * @date: 26.12.2023 + ******************************************************************************/ +// clang-format off +#include +// clang-format on + +#include + +#include "kaminpar-shm/context_io.h" + +#include "kaminpar-common/console_io.h" +#include "kaminpar-common/logger.h" +#include "kaminpar-common/strutils.h" + +#include "apps/io/shm_io.h" + +using namespace kaminpar; +using namespace kaminpar::shm; + +float average_degree(const Graph &graph) { + std::size_t average_degree = 0; + + for (const NodeID node : graph.nodes()) { + average_degree += graph.degree(node); + } + + return average_degree / (float)graph.n(); +} + +NodeID isolated_nodes(const Graph &graph) { + NodeID count = 0; + + for (const NodeID node : graph.nodes()) { + if (graph.degree(node) == 0) { + count++; + } + } + + return count; +} + +void print_graph_properties(const Graph &graph, const Context ctx, std::ostream &out) { + const float avg_deg = average_degree(graph); + const NodeID isolated_node_count = isolated_nodes(graph); + const std::size_t width = std::ceil(std::log10( + std::max({graph.n(), graph.m(), graph.max_degree(), isolated_node_count}) + )); + + cio::print_delimiter("Graph Properties", '#'); + out << "Graph: " << ctx.debug.graph_name << "\n"; + out << " Number of nodes: " << std::setw(width) << graph.n(); + if (graph.node_weighted()) { + out << " (total weight: " << graph.total_node_weight() << ")\n"; + } else { + out << " (unweighted)\n"; + } + out << " Number of edges: " << std::setw(width) << graph.m(); + if (graph.edge_weighted()) { + out << " (total weight: " << graph.total_edge_weight() << ")\n"; + } else { + out << " (unweighted)\n"; + } + out << " Max degree: " << std::setw(width) << graph.max_degree() << '\n'; + out << " Average degree: " << std::setw(width) << avg_deg << '\n'; + out << " Isolated nodes: " << std::setw(width) << isolated_node_count << '\n'; + + cio::print_delimiter("Graph Compression", '-'); + print(ctx.compression, out); +} + +int main(int argc, char *argv[]) { + Context ctx = create_default_context(); + std::string graph_filename; + io::GraphFileFormat graph_file_format = io::GraphFileFormat::METIS; + + CLI::App app("Shared-memory graph properties tool"); + app.add_option("-G,--graph", graph_filename, "Input graph in METIS format")->required(); + app.add_option("-t,--threads", ctx.parallel.num_threads, "Number of threads"); + app.add_option("-f,--graph-file-format", graph_file_format) + ->transform(CLI::CheckedTransformer(io::get_graph_file_formats()).description("")) + ->description(R"(Graph file formats: + - metis + - parhip)") + ->capture_default_str(); + create_graph_compression_options(&app, ctx); + CLI11_PARSE(app, argc, argv); + + tbb::global_control gc(tbb::global_control::max_allowed_parallelism, ctx.parallel.num_threads); + + Graph graph = io::read( + graph_filename, + graph_file_format, + ctx.compression.enabled, + ctx.compression.may_dismiss, + false, + false + ); + + ctx.debug.graph_name = str::extract_basename(graph_filename); + ctx.compression.setup(graph); + + print_graph_properties(graph, ctx, std::cout); + + return 0; +} diff --git a/apps/tools/shm_graph_rearrangement_tool.cc b/apps/tools/shm_graph_rearrangement_tool.cc new file mode 100644 index 00000000..3a8c77bf --- /dev/null +++ b/apps/tools/shm_graph_rearrangement_tool.cc @@ -0,0 +1,63 @@ +/******************************************************************************* + * CSR graph rearrangement tool for the shared-memory 
algorithm. + * + * @file: shm_graph_rearrangement_tool.cc + * @author: Daniel Salwasser + * @date: 14.12.2023 + ******************************************************************************/ +// clang-format off +#include +// clang-format on + +#include + +#include "kaminpar-shm/graphutils/permutator.h" + +#include "kaminpar-common/logger.h" + +#include "apps/io/shm_io.h" + +using namespace kaminpar; +using namespace kaminpar::shm; + +int main(int argc, char *argv[]) { + Context ctx = create_default_context(); + ctx.partition.k = 0; + + // Parse CLI arguments + std::string graph_filename; + std::string out_graph_filename; + + CLI::App app("Shared-memory graph rearrangement tool"); + app.add_option("-G,--graph", graph_filename, "Input graph in METIS format")->required(); + app.add_option("-O,--out", out_graph_filename, "Ouput file for saving the rearranged graph") + ->required(); + app.add_option("-t,--threads", ctx.parallel.num_threads, "Number of threads"); + create_partitioning_rearrangement_options(&app, ctx); + CLI11_PARSE(app, argc, argv); + + tbb::global_control gc(tbb::global_control::max_allowed_parallelism, ctx.parallel.num_threads); + + LOG << "Reading input graph..."; + CSRGraph input_graph = io::metis::csr_read( + graph_filename, ctx.node_ordering == NodeOrdering::IMPLICIT_DEGREE_BUCKETS + ); + + Graph graph(std::make_unique(std::move(input_graph))); + CSRGraph &csr_graph = *dynamic_cast(graph.underlying_graph()); + + LOG << "Rearranging graph..."; + if (ctx.node_ordering == NodeOrdering::DEGREE_BUCKETS) { + graph = graph::rearrange_by_degree_buckets(csr_graph); + graph::integrate_isolated_nodes(graph, ctx.partition.epsilon, ctx); + } + + if (ctx.edge_ordering == EdgeOrdering::COMPRESSION) { + graph::reorder_edges_by_compression(csr_graph); + } + + LOG << "Writing graph..."; + io::metis::write(out_graph_filename, graph); + + return 0; +} diff --git a/external/KaGen b/external/KaGen index 3882802d..2be1e625 160000 --- a/external/KaGen +++ b/external/KaGen @@ -1 +1 @@ -Subproject commit 3882802dd5a336775360157debbf589efd4a024f +Subproject commit 2be1e6257211d1caf02ec7a07a5027ec7a60a63a diff --git a/external/googletest b/external/googletest index 3b6d48e8..5a37b517 160000 --- a/external/googletest +++ b/external/googletest @@ -1 +1 @@ -Subproject commit 3b6d48e8d5c1d9b3f9f10ac030a94008bfaf032b +Subproject commit 5a37b517ad4ab6738556f0284c256cae1466c5b4 diff --git a/external/kassert b/external/kassert index f0873f85..e683aefa 160000 --- a/external/kassert +++ b/external/kassert @@ -1 +1 @@ -Subproject commit f0873f85ff046c6dee35a85148a51bfab73af44a +Subproject commit e683aefaa8e10ca9683a8c5bf1d63ff986f77cdd diff --git a/flake.lock b/flake.lock new file mode 100644 index 00000000..39b28ea8 --- /dev/null +++ b/flake.lock @@ -0,0 +1,61 @@ +{ + "nodes": { + "flake-utils": { + "inputs": { + "systems": "systems" + }, + "locked": { + "lastModified": 1710146030, + "narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1712963716, + "narHash": "sha256-WKm9CvgCldeIVvRz87iOMi8CFVB1apJlkUT4GGvA0iM=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "cfd6b5fc90b15709b780a5a1619695a88505a176", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + 
}, + "root": { + "inputs": { + "flake-utils": "flake-utils", + "nixpkgs": "nixpkgs" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 00000000..1501084a --- /dev/null +++ b/flake.nix @@ -0,0 +1,55 @@ +{ + description = "Shared-memory and distributed graph partitioner for large k partitioning."; + + inputs = { + nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; + flake-utils.url = "github:numtide/flake-utils"; + }; + + outputs = { self, nixpkgs, flake-utils, ... }: flake-utils.lib.eachDefaultSystem (system: + let + pkgs = import nixpkgs { inherit system; }; + inputs = builtins.attrValues { + inherit (pkgs) cmake ninja python312 gcc13 tbb_2021_11 sparsehash mpi; + }; + in + { + devShells.default = pkgs.mkShell { + packages = inputs ++ builtins.attrValues { + inherit (pkgs) fish ccache gdb; + }; + + shellHook = '' + exec fish + ''; + }; + + devShells.clang = (pkgs.mkShell.override { stdenv = pkgs.llvmPackages_18.stdenv; }) { + packages = (pkgs.lib.lists.remove pkgs.gcc13 inputs) ++ builtins.attrValues { + inherit (pkgs) fish ccache gdb; + }; + + shellHook = '' + exec fish + ''; + }; + + packages.default = pkgs.stdenv.mkDerivation { + pname = "KaMinPar"; + version = "2.1.0"; + + src = self; + nativeBuildInputs = inputs; + + cmakeFlags = [ "-DKAMINPAR_BUILD_DISTRIBUTED=On" ]; + enableParallelBuilding = true; + + meta = { + description = "Shared-memory and distributed graph partitioner for large k partitioning."; + homepage = "https://github.com/KaHIP/KaMinPar"; + license = pkgs.lib.licenses.mit; + }; + }; + } + ); +} diff --git a/kaminpar-cli/kaminpar_arguments.cc b/kaminpar-cli/kaminpar_arguments.cc index 14d98fce..1f67d8e4 100644 --- a/kaminpar-cli/kaminpar_arguments.cc +++ b/kaminpar-cli/kaminpar_arguments.cc @@ -14,6 +14,7 @@ namespace kaminpar::shm { void create_all_options(CLI::App *app, Context &ctx) { + create_graph_compression_options(app, ctx); create_partitioning_options(app, ctx); create_debug_options(app, ctx); create_coarsening_options(app, ctx); @@ -21,6 +22,23 @@ void create_all_options(CLI::App *app, Context &ctx) { create_refinement_options(app, ctx); } +CLI::Option_group *create_graph_compression_options(CLI::App *app, Context &ctx) { + auto *compression = app->add_option_group("Graph Compression"); + + compression->add_flag("-c,--compress", ctx.compression.enabled, "Enable graph compression") + ->default_val(false); + compression + ->add_flag( + "--may-dismiss", + ctx.compression.may_dismiss, + "Whether the compressed graph is only used if it uses less memory than the uncompressed " + "graph." 
+ ) + ->default_val(false); + + return compression; +} + CLI::Option_group *create_partitioning_options(CLI::App *app, Context &ctx) { auto *partitioning = app->add_option_group("Partitioning"); @@ -52,29 +70,58 @@ CLI::Option_group *create_partitioning_options(CLI::App *app, Context &ctx) { - async-parallel: diversify initial partitioning by replicating coarse graphs each branch of the replication tree asynchronously - sync-parallel: same as async-parallel, but process branches synchronously)") ->capture_default_str(); + partitioning->add_option( + "--p-deep-initial-partitioning-load", + ctx.partitioning.deep_initial_partitioning_load, + "Fraction of cores that should be used for the coarse graph replication phase of deep MGP. A " + "value of '1' will replicate the graph once for every PE, whereas smaller values lead to " + "fewer replications." + ); partitioning ->add_option( - "--p-deep-initial-partitioning-load", - ctx.partitioning.deep_initial_partitioning_load, - "Fraction of cores that should be used for the coarse graph replication phase of deep " - "MGP. A " - "value of '1' will replicate the graph once for every PE, whereas smaller values lead to " - "fewer replications." + "--p-min-consecutive-seq-bipartitioning-levels", + ctx.partitioning.min_consecutive_seq_bipartitioning_levels, + "(set to '0' for the old behaviour)" ) ->capture_default_str(); - partitioning->add_option("--rearrange-by", ctx.rearrange_by) - ->transform(CLI::CheckedTransformer(get_graph_orderings()).description("")) - ->description(R"(Criteria by which the graph is sorted and rearrange: - - natural: keep order of the graph (do not rearrange) - - deg-buckets: sort nodes by degree bucket and rearrange accordingly)") - ->capture_default_str(); + + create_partitioning_rearrangement_options(app, ctx); return partitioning; } +CLI::Option_group *create_partitioning_rearrangement_options(CLI::App *app, Context &ctx) { + auto *rearrangement = app->add_option_group("Partitioning -> Rearrangement"); + + rearrangement->add_option("--node-order", ctx.node_ordering) + ->transform(CLI::CheckedTransformer(get_node_orderings()).description("")) + ->description(R"(Criteria by which the nodes of the graph are sorted and rearranged: + - natural: keep node order of the graph (do not rearrange) + - deg-buckets: sort nodes by degree bucket and rearrange accordingly + - implicit-deg-buckets: nodes of the input graph are sorted by deg-buckets order)") + ->capture_default_str(); + rearrangement->add_option("--edge-order", ctx.edge_ordering) + ->transform(CLI::CheckedTransformer(get_edge_orderings()).description("")) + ->description(R"(Criteria by which the edges of the graph are sorted and rearranged: + - natural: keep edge order of the graph (do not rearrange) + - compression: sort the edges of each neighbourhood with the ordering of the corresponding compressed graph)" + ) + ->capture_default_str(); + + return rearrangement; +} + CLI::Option_group *create_coarsening_options(CLI::App *app, Context &ctx) { auto *coarsening = app->add_option_group("Coarsening"); + // Coarsening options: + coarsening->add_option("--c-algorithm", ctx.coarsening.algorithm) + ->transform(CLI::CheckedTransformer(get_coarsening_algorithms()).description("")) + ->description(R"(One of the following options: + - noop: disable coarsening + - clustering: coarsening by clustering and contracting)") + ->capture_default_str(); + coarsening ->add_option( "--c-contraction-limit", @@ -83,14 +130,24 @@ CLI::Option_group *create_coarsening_options(CLI::App *app, Context 
&ctx) { ) ->capture_default_str(); - coarsening->add_option("--c-clustering-algorithm", ctx.coarsening.algorithm) + coarsening + ->add_option( + "--c-convergence-threshold", + ctx.coarsening.convergence_threshold, + "Coarsening converges once the size of the graph shrinks by " + "less than this factor." + ) + ->capture_default_str(); + + // Clustering options: + coarsening->add_option("--c-clustering-algorithm", ctx.coarsening.clustering.algorithm) ->transform(CLI::CheckedTransformer(get_clustering_algorithms()).description("")) ->description(R"(One of the following options: - noop: disable coarsening - lp: size-constrained label propagation)") ->capture_default_str(); - coarsening->add_option("--c-cluster-weight-limit", ctx.coarsening.cluster_weight_limit) + coarsening->add_option("--c-cluster-weight-limit", ctx.coarsening.clustering.cluster_weight_limit) ->transform(CLI::CheckedTransformer(get_cluster_weight_limits()).description("")) ->description( R"(This option selects the formula used to compute the weight limit for nodes in coarse graphs. @@ -106,21 +163,22 @@ Options are: coarsening ->add_option( "--c-cluster-weight-multiplier", - ctx.coarsening.cluster_weight_multiplier, + ctx.coarsening.clustering.cluster_weight_multiplier, "Multiplicator of the maximum cluster weight base value." ) ->capture_default_str(); coarsening ->add_option( - "--c-coarsening-convergence-threshold", - ctx.coarsening.convergence_threshold, - "Coarsening converges once the size of the graph shrinks by " - "less than this factor." + "--c-max-memory-free-coarsening-level", + ctx.coarsening.clustering.max_mem_free_coarsening_level, + "Maximum coarsening level for which the corresponding memory should be released " + "afterwards" ) ->capture_default_str(); create_lp_coarsening_options(app, ctx); + create_contraction_coarsening_options(app, ctx); return coarsening; } @@ -130,24 +188,70 @@ CLI::Option_group *create_lp_coarsening_options(CLI::App *app, Context &ctx) { lp->add_option( "--c-lp-num-iterations", - ctx.coarsening.lp.num_iterations, + ctx.coarsening.clustering.lp.num_iterations, "Maximum number of label propagation iterations" ) ->capture_default_str(); lp->add_option( "--c-lp-active-large-degree-threshold", - ctx.coarsening.lp.large_degree_threshold, + ctx.coarsening.clustering.lp.large_degree_threshold, "Threshold for ignoring nodes with large degree" ) ->capture_default_str(); lp->add_option( "--c-lp-max-num-neighbors", - ctx.coarsening.lp.max_num_neighbors, + ctx.coarsening.clustering.lp.max_num_neighbors, "Limit the neighborhood to this many nodes" ) ->capture_default_str(); - lp->add_option("--c-lp-two-hop-strategy", ctx.coarsening.lp.two_hop_strategy) + lp->add_option( + "--c-lp-use-two-level-cluster-weight-vector", + ctx.coarsening.clustering.lp.use_two_level_cluster_weight_vector, + "Whether to use the two level cluster weight vector" + ) + ->capture_default_str(); + + lp->add_option( + "--c-lp-two-phases", + ctx.coarsening.clustering.lp.use_two_phases, + "Uses two phases in each iteration, where in the second phase the high-degree nodes are " + "treated separately" + ) + ->capture_default_str(); + lp->add_option( + "--c-lp-second-phase-select-mode", ctx.coarsening.clustering.lp.second_phase_select_mode + ) + ->transform(CLI::CheckedTransformer(get_second_phase_select_modes()).description("")) + ->description( + R"(Determines the mode for selecting nodes for the second phase of label propagation. 
+Options are: + - high-degree: Select nodes with high degree + - full-rating-map: Select nodes which have a full rating map in the first phase + )" + ) + ->capture_default_str(); + lp->add_option( + "--c-lp-second-phase-aggregation-mode", + ctx.coarsening.clustering.lp.second_phase_aggregation_mode + ) + ->transform(CLI::CheckedTransformer(get_second_phase_aggregation_modes()).description("")) + ->description( + R"(Determines the mode for aggregating ratings in the second phase of label propagation. +Options are: + - none: Skip the second phase + - direct: Write the ratings directly into the global vector (shared between threads) + - buffered: Write the ratings into a thread-local buffer and then copy them into the global vector when the buffer is full + )" + ); + lp->add_option( + "--c-lp-second-phase-relabel", + ctx.coarsening.clustering.lp.relabel_before_second_phase, + "Relabel the clusters before running the second phase" + ) + ->capture_default_str(); + + lp->add_option("--c-lp-two-hop-strategy", ctx.coarsening.clustering.lp.two_hop_strategy) ->transform(CLI::CheckedTransformer(get_two_hop_strategies()).description("")) ->description(R"(Determines the strategy for handling singleton clusters during coarsening. Options are: @@ -159,13 +263,15 @@ Options are: ->capture_default_str(); lp->add_option( "--c-lp-two-hop-threshold", - ctx.coarsening.lp.two_hop_threshold, + ctx.coarsening.clustering.lp.two_hop_threshold, "Enable two-hop clustering if plain label propagation shrunk " "the graph by less than this factor" ) ->capture_default_str(); - lp->add_option("--c-lp-isolated-nodes-strategy", ctx.coarsening.lp.isolated_nodes_strategy) + lp->add_option( + "--c-lp-isolated-nodes-strategy", ctx.coarsening.clustering.lp.isolated_nodes_strategy + ) ->transform( CLI::CheckedTransformer(get_isolated_nodes_clustering_strategies()).description("") ) @@ -182,6 +288,32 @@ Options are: return lp; } +CLI::Option_group *create_contraction_coarsening_options(CLI::App *app, Context &ctx) { + auto *contraction = app->add_option_group("Coarsening -> Contraction"); + + contraction->add_option("--c-con-mode", ctx.coarsening.contraction.mode) + ->transform(CLI::CheckedTransformer(get_contraction_modes()).description("")) + ->description(R"(The mode useed for contraction. 
+Options are: + - edge-buffer: Use an edge buffer to store edges temporarily + - no-edge-buffer-naive: Use no edge buffer by computing the neighborhood of each coarse node twice + - no-edge-buffer-remap: Use no edge buffer by remapping the coarse nodes afterwards + )") + ->capture_default_str(); + contraction + ->add_option( + "--c-con-edge-buffer-fill-fraction", + ctx.coarsening.contraction.edge_buffer_fill_fraction, + "The fraction of the total edges with which to fill the edge buffer" + ) + ->capture_default_str(); + contraction->add_flag( + "--c-con-use-compact-mapping", ctx.coarsening.contraction.use_compact_mapping + ); + + return contraction; +} + CLI::Option_group *create_initial_partitioning_options(CLI::App *app, Context &ctx) { auto *ip = app->add_option_group("Initial Partitioning"); @@ -240,6 +372,36 @@ CLI::Option_group *create_lp_refinement_options(CLI::App *app, Context &ctx) { ) ->capture_default_str(); + lp->add_option( + "--r-lp-two-phases", + ctx.refinement.lp.use_two_phases, + "Uses two phases in each iteration, where in the second phase the high-degree nodes are " + "treated separately" + ) + ->capture_default_str(); + lp->add_option("--r-lp-second-phase-select-mode", ctx.refinement.lp.second_phase_select_mode) + ->transform(CLI::CheckedTransformer(get_second_phase_select_modes()).description("")) + ->description( + R"(Determines the mode for selecting nodes for the second phase of label propagation. +Options are: + - high-degree: Select nodes with high degree + - full-rating-map: Select nodes which have a full rating map in the first phase + )" + ) + ->capture_default_str(); + lp->add_option( + "--r-lp-second-phase-aggregation-mode", ctx.refinement.lp.second_phase_aggregation_mode + ) + ->transform(CLI::CheckedTransformer(get_second_phase_aggregation_modes()).description("")) + ->description( + R"(Determines the mode for aggregating ratings in the second phase of label propagation. 
+Options are: + - none: Skip the second phase + - direct: Write the ratings directly into the global vector (shared between threads) + - buffered: Write the ratings into a thread-local buffer and then copy them into the global vector when the buffer is full + )" + ); + return lp; } diff --git a/kaminpar-cli/kaminpar_arguments.h b/kaminpar-cli/kaminpar_arguments.h index 9db5de3f..66a6e088 100644 --- a/kaminpar-cli/kaminpar_arguments.h +++ b/kaminpar-cli/kaminpar_arguments.h @@ -11,17 +11,23 @@ #include "kaminpar-cli/CLI11.h" // clang-format on -#include "kaminpar-shm/context.h" +#include namespace kaminpar::shm { void create_all_options(CLI::App *app, Context &ctx); +CLI::Option_group *create_graph_compression_options(CLI::App *app, Context &ctx); + CLI::Option_group *create_partitioning_options(CLI::App *app, Context &ctx); +CLI::Option_group *create_partitioning_rearrangement_options(CLI::App *app, Context &ctx); + CLI::Option_group *create_coarsening_options(CLI::App *app, Context &ctx); CLI::Option_group *create_lp_coarsening_options(CLI::App *app, Context &ctx); +CLI::Option_group *create_contraction_coarsening_options(CLI::App *app, Context &ctx); + CLI::Option_group *create_initial_partitioning_options(CLI::App *app, Context &ctx); CLI::Option_group *create_refinement_options(CLI::App *app, Context &ctx); diff --git a/kaminpar-common/CMakeLists.txt b/kaminpar-common/CMakeLists.txt index d0bac9bb..8d5b989a 100644 --- a/kaminpar-common/CMakeLists.txt +++ b/kaminpar-common/CMakeLists.txt @@ -24,6 +24,9 @@ find_package(TBB REQUIRED) find_library(NUMA_LIB numa) # optional target_link_libraries(kaminpar_common PUBLIC TBB::tbb TBB::tbbmalloc kassert::kassert) +if (KAMINPAR_BUILD_WITH_GROWT) + target_link_libraries(kaminpar_common PUBLIC growt) +endif () if (NUMA_LIB) target_link_libraries(kaminpar_common PUBLIC ${NUMA_LIB}) endif () diff --git a/kaminpar-common/asserting_cast.h b/kaminpar-common/asserting_cast.h index 244e5b78..8be237e5 100644 --- a/kaminpar-common/asserting_cast.h +++ b/kaminpar-common/asserting_cast.h @@ -58,4 +58,13 @@ template To asserting_cast(const From value) { ); return static_cast(value); } + +template To asserting_cast(const From value) { + KASSERT( + in_range(value), + value << " of type " << typeid(From).name() << " not in range of type " << typeid(To).name(), + assertion_level + ); + return static_cast(value); +} } // namespace kaminpar diff --git a/kaminpar-common/constexpr_utils.h b/kaminpar-common/constexpr_utils.h new file mode 100644 index 00000000..e0c58fc3 --- /dev/null +++ b/kaminpar-common/constexpr_utils.h @@ -0,0 +1,54 @@ +/******************************************************************************* + * Utility functions for constant expressions. + * + * @file: constexpr_utils.h + * @author: Daniel Salwasser + * @date: 29.12.2023 + ******************************************************************************/ +#pragma once + +#include +#include + +namespace kaminpar { + +/*! + * Invokes a function either directly or indirectly depending on a lambda. + * + * @tparam direct Whether to call the function directly. + * @tparam Lambda The type of the lambda to pass to the function. + * @tparam Function The type of the function to invoke. + * @param l The lambda to pass to the function. + * @param fun The function to invoke. 
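+ *
+ * A small sketch with hypothetical lambdas: for direct = false, `l` is expected to pass
+ * an inner lambda to the callback it receives, and that inner lambda is what `fun` ends
+ * up being invoked with:
+ *
+ *   auto l = [](auto &&callback) { callback([](const int v) { return v + 1; }); };
+ *   invoke_maybe_indirect<false>(l, [](auto &&inner) { inner(41); }); // inner yields 42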
+ */ +template +constexpr void invoke_maybe_indirect(Lambda &&l, Function &&fun) { + if constexpr (direct) { + fun(std::forward(l)); + } else { + l([&](auto &&l2) { fun(std::forward(l2)); }); + } +} + +// Utility functions for constexpr loops based on https://stackoverflow.com/a/47563100 +template struct Number { + static const constexpr auto value = N; +}; + +template +constexpr void constexpr_for(Lambda &&l, std::index_sequence) { + (l(Number::value), ...); +} + +/*! + * Calls a lambda a specific amount of times with an index. + * + * @tparam N The amount of times to call a lambda. + * @tparam Lambda The type of lambda to call. + * @param l The lambda to call N times with the current number of times called. + */ +template constexpr void constexpr_for(Lambda &&l) { + constexpr_for(std::forward(l), std::make_index_sequence()); +} + +} // namespace kaminpar diff --git a/kaminpar-common/datastructures/binary_heap.h b/kaminpar-common/datastructures/binary_heap.h index 90c8d5da..481dd239 100644 --- a/kaminpar-common/datastructures/binary_heap.h +++ b/kaminpar-common/datastructures/binary_heap.h @@ -474,6 +474,8 @@ class DynamicBinaryForest { Key key; }; + explicit DynamicBinaryForest() {} + explicit DynamicBinaryForest(const std::size_t capacity, const std::size_t heaps) : _id_pos(capacity, kInvalidID), _heaps(heaps) {} @@ -482,6 +484,11 @@ class DynamicBinaryForest { DynamicBinaryForest &operator=(const DynamicBinaryForest &) = delete; DynamicBinaryForest &operator=(DynamicBinaryForest &&) noexcept = default; + void init(const std::size_t capacity, const std::size_t heaps) { + _id_pos.resize(capacity, kInvalidID); + _heaps.resize(heaps); + } + std::size_t capacity() const { return _id_pos.size(); } @@ -654,6 +661,8 @@ using DynamicBinaryMinForest = DynamicBinaryForest class DynamicBinaryMinMaxForest { public: + DynamicBinaryMinMaxForest() {} + DynamicBinaryMinMaxForest(const std::size_t capacity, const std::size_t heaps) : _max_forest(capacity, heaps), _min_forest(capacity, heaps) {} @@ -664,6 +673,11 @@ template class DynamicBinaryMinMaxForest { DynamicBinaryMinMaxForest(DynamicBinaryMinMaxForest &&) noexcept = default; DynamicBinaryMinMaxForest &operator=(DynamicBinaryMinMaxForest &&) noexcept = default; + void init(const std::size_t capacity, const std::size_t heaps) { + _max_forest.init(capacity, heaps); + _min_forest.init(capacity, heaps); + } + [[nodiscard]] inline std::size_t capacity() const { return _max_forest.capacity(); } diff --git a/kaminpar-common/datastructures/compact_static_array.h b/kaminpar-common/datastructures/compact_static_array.h new file mode 100644 index 00000000..5bcda3f5 --- /dev/null +++ b/kaminpar-common/datastructures/compact_static_array.h @@ -0,0 +1,322 @@ +/******************************************************************************* + * A static array which stores integers with only as many bytes as the largest + * integer requires. + * + * @file: compact_static_array.h + * @author: Daniel Salwasser + * @date: 12.01.2024 + ******************************************************************************/ +#pragma once + +#include +#include +#include + +namespace kaminpar { + +/*! + * A static array which stores integers with only as many bytes as the largest integer requires. + * + * @tparam Int The type of integer to store. 
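For reference, a minimal usage sketch of the constexpr_for helper added in constexpr_utils.h above (not part of the patch). The angle-bracket contents of the template parameter lists were dropped from this hunk, so the sketch assumes the documented constexpr_for<N>(lambda) form, which expands the lambda once per index 0..N-1 via a fold expression:

#include <cstddef>
#include <iostream>

#include "kaminpar-common/constexpr_utils.h"

int main() {
  std::size_t sum = 0;
  // The call is unrolled at compile time; the lambda is invoked with the indices 0, 1, 2, 3.
  kaminpar::constexpr_for<4>([&](const auto i) { sum += i; });
  std::cout << sum << '\n'; // prints 6
}
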
+ */ +template class CompactStaticArray { + static_assert(std::numeric_limits::is_integer); + + class CompactStaticArrayIterator { + public: + using iterator_category = std::random_access_iterator_tag; + using value_type = Int; + using reference = Int &; + using pointer = Int *; + using difference_type = std::ptrdiff_t; + + CompactStaticArrayIterator( + const std::uint8_t byte_width, const Int mask, const std::uint8_t *data + ) + : _byte_width(byte_width), + _mask(mask), + _data(data) {} + + CompactStaticArrayIterator(const CompactStaticArrayIterator &other) = default; + CompactStaticArrayIterator &operator=(const CompactStaticArrayIterator &other) = default; + + Int operator*() const { + return *reinterpret_cast(_data) & _mask; + } + + pointer operator->() const { + return *reinterpret_cast(_data) & _mask; + } + + reference operator[](const difference_type n) const { + return *reinterpret_cast(_data + _byte_width * n) & _mask; + } + + CompactStaticArrayIterator &operator++() { + return _data += _byte_width, *this; + } + + CompactStaticArrayIterator &operator--() { + return _data -= _byte_width, *this; + } + + CompactStaticArrayIterator operator++(int) const { + return CompactStaticArrayIterator{_byte_width, _mask, _data + _byte_width}; + } + + CompactStaticArrayIterator operator--(int) const { + return CompactStaticArrayIterator{_byte_width, _mask, _data - _byte_width}; + } + + CompactStaticArrayIterator operator+(const difference_type n) const { + return CompactStaticArrayIterator{_byte_width, _mask, _data + _byte_width * n}; + } + + CompactStaticArrayIterator operator-(const difference_type n) const { + return CompactStaticArrayIterator{_byte_width, _mask, _data - _byte_width * n}; + } + + CompactStaticArrayIterator &operator+=(const difference_type n) { + return _data += _byte_width * n, *this; + } + + CompactStaticArrayIterator &operator-=(const difference_type n) { + return _data -= _byte_width * n, *this; + } + + difference_type operator+(const CompactStaticArrayIterator &other) const { + return (reinterpret_cast(_data) / _byte_width) + + (reinterpret_cast(other._data) / _byte_width); + } + + difference_type operator-(const CompactStaticArrayIterator &other) const { + return (reinterpret_cast(_data) / _byte_width) - + (reinterpret_cast(other._data) / _byte_width); + } + + bool operator==(const CompactStaticArrayIterator &other) const { + return _data == other._data; + } + + bool operator!=(const CompactStaticArrayIterator &other) const { + return _data != other._data; + } + + bool operator>(const CompactStaticArrayIterator &other) const { + return _data > other._data; + } + + bool operator<(const CompactStaticArrayIterator &other) const { + return _data < other._data; + } + + bool operator>=(const CompactStaticArrayIterator &other) const { + return _data >= other._ptr; + } + + bool operator<=(const CompactStaticArrayIterator &other) const { + return _data <= other._data; + } + + private: + const std::uint8_t _byte_width; + const Int _mask; + const std::uint8_t *_data; + }; + +public: + using value_type = Int; + using size_type = std::size_t; + using reference = value_type &; + using const_reference = const value_type &; + using iterator = CompactStaticArrayIterator; + using const_iterator = const CompactStaticArrayIterator; + + /*! + * Constructs a new CompactStaticArray. + */ + CompactStaticArray() : _byte_width(0), _size(0), _unrestricted_size(0) { + RECORD_DATA_STRUCT(0, _struct); + } + + /*! + * Constructs a new CompactStaticArray. 
+ * + * @param byte_width The number of bytes needed to store the largest integer in the array. + * @param size The number of values to store. + */ + CompactStaticArray(const std::uint8_t byte_width, const std::size_t size) { + KASSERT(byte_width <= 8); + RECORD_DATA_STRUCT(0, _struct); + + resize(byte_width, size); + } + + /*! + * Constructs a new CompactStaticArray. + * + * @param byte_width The number of bytes needed to store the largest integer in the array. + * @param actual_size The number of bytes that the compact representation in memory uses. + * @param data The pointer to the memory location where the data is compactly stored. + */ + CompactStaticArray( + const std::uint8_t byte_width, + const std::size_t actual_size, + std::unique_ptr data + ) + : _byte_width(byte_width), + _size(actual_size), + _values(std::move(data)), + _mask( + (byte_width == 8) ? std::numeric_limits::max() + : (static_cast(1) << (byte_width * 8)) - 1 + ) { + KASSERT(byte_width <= 8); + RECORD_DATA_STRUCT(0, _struct); + } + + CompactStaticArray(const CompactStaticArray &) = delete; + CompactStaticArray &operator=(const CompactStaticArray &) = delete; + + CompactStaticArray(CompactStaticArray &&) noexcept = default; + CompactStaticArray &operator=(CompactStaticArray &&) noexcept = default; + + /*! + * Resizes the array. + * + * @param byte_width The number of bytes needed to store the largest integer in the array. + * @param size The number of values to store. + */ + void resize(const std::uint8_t byte_width, const std::size_t size) { + IF_HEAP_PROFILING( + _struct->size = std::max(_struct->size, byte_width * size + sizeof(Int) - byte_width) + ); + + _byte_width = byte_width; + _size = byte_width * size + sizeof(Int) - byte_width; + _unrestricted_size = _size; + _values = std::make_unique(_size); + _mask = (byte_width == 8) ? std::numeric_limits::max() + : (static_cast(1) << (byte_width * 8)) - 1; + } + + /*! + * Restricts the array to a specific size. This operation can be undone by calling the unrestrict + * method. + * + * @param new_size The number of values to be visible. + */ + void restrict(const std::size_t new_size) { + _unrestricted_size = _size; + _size = _byte_width * new_size + sizeof(Int) - _byte_width; + } + + /*! + * Undos the previous restriction. It does nothing when the restrict method has previously not + * been invoked. + */ + void unrestrict() { + _size = _unrestricted_size; + } + + /*! + * Stores an integer in the array. + * + * @param pos The position in the array at which to store the integer. + * @param value The value to store. + */ + void write(const std::size_t pos, Int value) { + std::uint8_t *data = _values.get() + pos * _byte_width; + + for (std::uint8_t i = 0; i < _byte_width; ++i) { + *data++ = value & 0b11111111; + value >>= 8; + } + } + + /*! + * Accesses an integer in the array. + * + * @param pos The position of the integer in the array to return. + * @return The integer stored at the position in the array. + */ + [[nodiscard]] Int operator[](const std::size_t pos) const { + return *reinterpret_cast(_values.get() + pos * _byte_width) & _mask; + } + + /*! + * Returns an interator to the beginning. + * + * @return An interator to the beginning. + */ + [[nodiscard]] CompactStaticArrayIterator begin() const { + return CompactStaticArrayIterator(_byte_width, _mask, _values.get()); + } + + /*! + * Returns an interator to the end. + * + * @return An interator to the end. 
+ */ + [[nodiscard]] CompactStaticArrayIterator end() const { + return CompactStaticArrayIterator( + _byte_width, _mask, _values.get() + _size - (sizeof(Int) - _byte_width) + ); + } + + /*! + * Returns whether the array is empty. + * + * @return Whether the array is empty. + */ + [[nodiscard]] bool empty() const { + return _size == 0; + } + + /*! + * Returns the amount of integers in the array. + * + * @return The amount of integers in the array. + */ + [[nodiscard]] std::size_t size() const { + return (_size - (sizeof(Int) - _byte_width)) / _byte_width; + } + + /*! + * Returns the number of bytes needed to store the largest integer in the array. + * + * @return The number of bytes needed to store the largest integer in the array. + */ + [[nodiscard]] std::uint8_t byte_width() const { + return _byte_width; + } + + /*! + * Returns the amount of bytes the compact array allocated. + * + * @return The amount of bytes the compact array allocated. + */ + [[nodiscard]] std::size_t allocated_size() const { + return _size; + } + + /*! + * Returns a pointer to the memory location where the data is compactly stored. + * + * @returns A pointer to the memory location where the data is compactly stored. + */ + [[nodiscard]] const std::uint8_t *data() const { + return _values.get(); + } + +private: + std::uint8_t _byte_width; + std::size_t _size; + std::size_t _unrestricted_size; + std::unique_ptr _values; + Int _mask; + + IF_HEAP_PROFILING(heap_profiler::DataStructure *_struct); +}; + +}; // namespace kaminpar diff --git a/kaminpar-common/datastructures/concurrent_bit_vector.h b/kaminpar-common/datastructures/concurrent_bit_vector.h new file mode 100644 index 00000000..86b0a1e4 --- /dev/null +++ b/kaminpar-common/datastructures/concurrent_bit_vector.h @@ -0,0 +1,140 @@ +/******************************************************************************* + * A concurrent bit vector which stores bits compactly and uses atomic read/write operations. + * + * @file: concurrent_bit_vector.h + * @author: Daniel Salwasser + * @date: 25.01.2024 + ******************************************************************************/ +#pragma once + +#include + +#include + +#include "kaminpar-common/math.h" + +namespace kaminpar { + +/*! + * A concurrent bit vector which stores bits compactly and uses atomic read/write operations. + * + * @tparam Size The type of index to use to access bits. + */ +template class ConcurrentBitVector { +public: + /*! + * Constructs a new empty ConcurrentBitVector. + */ + ConcurrentBitVector() : _size(0), _byte_capacity(0) {} + + /*! + * Constructs a new ConcurrentBitVector + * + * @param size The number of bits to store. + */ + ConcurrentBitVector(const Size size) + : _size(size), + _byte_capacity(math::div_ceil(size, 8)), + _data(std::make_unique(_byte_capacity)) {} + + ConcurrentBitVector(const ConcurrentBitVector &) = delete; + ConcurrentBitVector &operator=(const ConcurrentBitVector &) = delete; + + ConcurrentBitVector(ConcurrentBitVector &&) noexcept = default; + ConcurrentBitVector &operator=(ConcurrentBitVector &&) noexcept = default; + + /*! + * Atomically loads a bit. + * + * @param pos The position of the bit to load. + * @return Whether the bit is set. + */ + [[nodiscard]] bool load(const Size pos) const noexcept { + KASSERT(pos < _size); + + std::uint8_t *ptr = _data.get() + (pos / 8); + const std::uint8_t mask = 1 << (pos % 8); + return (__atomic_load_n(ptr, __ATOMIC_RELAXED) & mask) != 0; + } + + /*! + * Atomically sets a bit. + * + * @param pos The position of the bit to set. 
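For reference, a minimal usage sketch of the CompactStaticArray introduced above (not part of the patch). Template arguments were stripped from this hunk, so the sketch assumes the documented CompactStaticArray<Int> form; the stored values and the byte width are made up for illustration:

#include <cstdint>
#include <iostream>

#include "kaminpar-common/datastructures/compact_static_array.h"

int main() {
  using kaminpar::CompactStaticArray;

  // The largest value we need to store is < 2^24, so 3 bytes per entry suffice.
  CompactStaticArray<std::uint64_t> offsets(/* byte_width */ 3, /* size */ 4);
  offsets.write(0, 0);
  offsets.write(1, 42);
  offsets.write(2, 65'536);
  offsets.write(3, 16'777'215); // 2^24 - 1, the largest 3-byte value

  for (std::size_t i = 0; i < offsets.size(); ++i) {
    std::cout << offsets[i] << '\n';
  }

  // 3 bytes per entry plus sizeof(Int) - byte_width bytes of slack so that the
  // full-width read of the last entry stays inside the allocation.
  std::cout << offsets.allocated_size() << " bytes allocated\n";
}
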
+ */ + void set(const Size pos) noexcept { + KASSERT(pos < _size); + + std::uint8_t *ptr = _data.get() + (pos / 8); + const std::uint8_t mask = 1 << (pos % 8); + __atomic_fetch_or(ptr, mask, __ATOMIC_RELAXED); + } + + /*! + * Atomically unsets a bit. + * + * @param pos The position of the bit to unset. + */ + void unset(const Size pos) noexcept { + KASSERT(pos < _size); + + std::uint8_t *ptr = _data.get() + (pos / 8); + const std::uint8_t mask = ~(1 << (pos % 8)); + __atomic_fetch_and(ptr, mask, __ATOMIC_RELAXED); + } + + /*! + * Sets (non-atomically) all bits in the vector. + */ + void set_all() noexcept { + std::fill(_data.get(), _data.get() + _byte_capacity, 0b11111111); + } + + /*! + * Resizes the vector. + * + * @param size The number of bits to store. + */ + void resize(const Size size) { + KASSERT(size > 0); + + _size = size; + _byte_capacity = math::div_ceil(size, 8); + _data = std::make_unique(_byte_capacity); + } + + /*! + * Frees the memory used by this data structure. + */ + void free() { + _size = 0; + _byte_capacity = 0; + _data.release(); + } + + /*! + * Returns the amount of bits that this vector stores. + * + * @return The amount of bits that this vector stores. + */ + [[nodiscard]] Size size() const noexcept { + return _size; + } + + /*! + * Returns the amount of bits that this vector can store, i.e. the size including internal + * fragmentation. + * + * @return The amount of bits that this vector can store. + */ + [[nodiscard]] Size capacity() const noexcept { + return _byte_capacity * 8; + } + +private: + Size _size; + Size _byte_capacity; + std::unique_ptr _data; +}; + +} // namespace kaminpar diff --git a/kaminpar-common/datastructures/concurrent_fast_reset_array.h b/kaminpar-common/datastructures/concurrent_fast_reset_array.h new file mode 100644 index 00000000..da217402 --- /dev/null +++ b/kaminpar-common/datastructures/concurrent_fast_reset_array.h @@ -0,0 +1,120 @@ +/******************************************************************************* + * Static array that can reset used elements in O(# of used elements), similar to FastResetArray. + * But instead of marking an entry as used when it is accessed, it is marked by the user, otherwise + * multiple concurrent accesses to the same value would mark the value as used multiple times. + * + * @file: concurrent_fast_reset_array.h + * @author: Daniel Salwasser + * @date: 29.10.2023 + ******************************************************************************/ +#pragma once + +#include + +#include + +#include "kaminpar-common/heap_profiler.h" +#include "kaminpar-common/parallel/aligned_element.h" + +namespace kaminpar { + +/*! + * A static array that can reset used elements in O(# of used elements). + * + * @tparam Value The type of value to store. + * @tparam Size The type of index to use to access and save values. + */ +template class ConcurrentFastResetArray { +public: + using value_type = Value; + using reference = Value &; + using size_type = Size; + + /*! + * Constructs a new ConcurrentFastResetArray. + * + * @param capacity The capacity of the map, i.e. the amount of values to possibly save. + */ + explicit ConcurrentFastResetArray(const std::size_t capacity = 0) : _data(capacity) { + RECORD_DATA_STRUCT(capacity * sizeof(value_type), _struct); + _used_entries_tls.resize(tbb::this_task_arena::max_concurrency()); + } + + /*! + * Returns the capacity of this array. + * + * @return The capacity of this array. + */ + std::size_t capacity() const { + return _data.capacity(); + } + + /*! 
+ * Returns the thread-local vector of used entries. + * + * @return The thread-local vector of used entries. + */ + [[nodiscard]] std::vector &local_used_entries() { + return _used_entries_tls[tbb::this_task_arena::current_thread_index()].vec; + } + + /*! + * Accesses a value at a position. + * + * @param pos The position of the value in the map to return. It should be greater or equal then + * zero and less then the set capacity. + * @return A reference to the value at the position. + */ + [[nodiscard]] reference operator[](const size_type pos) { + KASSERT(pos < _data.size()); + return _data[pos]; + } + + /*! + * Resized the array. + * + * @param capacity The new capacity of the map, i.e. the amount of values to possibly save. + */ + void resize(const size_type capacity) { + IF_HEAP_PROFILING(_struct->size = std::max(_struct->size, capacity * sizeof(value_type))); + _data.resize(capacity); + _used_entries_tls.resize(tbb::this_task_arena::max_concurrency()); + } + + /*! + * Frees the memory used by this data structure. + */ + void free() { + _data.clear(); + _data.shrink_to_fit(); + + _used_entries_tls.clear(); + _used_entries_tls.shrink_to_fit(); + } + + /*! + * Iterates over all thread-local vector of used entries and clears them afterwards. + * + * @param l The function object that is invoked with a thread-local vector of used entries before + * they are cleared. + */ + template void iterate_and_reset(Lambda &&l) { + tbb::parallel_for(0, _used_entries_tls.size(), [&](const auto i) { + l(i, _used_entries_tls[i]); + + for (const size_type pos : _used_entries_tls[i]) { + _data[pos] = Value(); + } + + _used_entries_tls[i].clear(); + }); + } + +private: + std::vector _data; + std::vector>> _used_entries_tls; + + IF_HEAP_PROFILING(heap_profiler::DataStructure *_struct); +}; + +} // namespace kaminpar diff --git a/kaminpar-common/datastructures/concurrent_two_level_vector.h b/kaminpar-common/datastructures/concurrent_two_level_vector.h new file mode 100644 index 00000000..056f2c9f --- /dev/null +++ b/kaminpar-common/datastructures/concurrent_two_level_vector.h @@ -0,0 +1,484 @@ +/******************************************************************************* + * A two-level vector which stores small values in a contiguous vector and large values in a hash + * table. + * + * @file: concurrent_two_level_vector.h + * @author: Daniel Salwasser + * @date: 18.01.2024 + ******************************************************************************/ +#pragma once + +#include + +#include + +#ifdef KAMINPAR_USES_GROWT +#include +#include +#include +#else +#include +#endif + +#include "kaminpar-common/datastructures/static_array.h" + +namespace kaminpar { + +#ifdef KAMINPAR_USES_GROWT +/*! + * A concurrent two-level vector which consists of a vector and a hash table. The data structure + * stores values of small size directly in the vector and bigger values in the hash table. + * + * @tparam Value The type of integer to store. + * @tparam Size The type of integer to access the values with. + * @tparam FirstValue The type of integer to store in the vector. It has to be smaller than the + * value type. 
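For reference, a minimal usage sketch of the ConcurrentFastResetArray introduced above (not part of the patch). The sketch assumes the documented ConcurrentFastResetArray<Value, Size> parameter order; slot and rating names are illustrative. Each task records the slots it touches in its thread-local used-entries list, and iterate_and_reset later visits and clears only those slots:

#include <atomic>
#include <cstdint>
#include <iostream>

#include <tbb/parallel_for.h>

#include "kaminpar-common/datastructures/concurrent_fast_reset_array.h"

int main() {
  kaminpar::ConcurrentFastResetArray<std::int64_t, std::uint32_t> ratings(1'000);

  // Each task touches a distinct slot: it records the slot in its thread-local
  // list of used entries and writes to it.
  tbb::parallel_for(0u, 1'000u, [&](const std::uint32_t slot) {
    ratings.local_used_entries().push_back(slot);
    ratings[slot] = 2 * static_cast<std::int64_t>(slot);
  });

  // Visit the entries recorded by each thread; afterwards the touched slots are
  // reset to Value(), so the array can be reused without a full clear.
  std::atomic<std::int64_t> total{0};
  ratings.iterate_and_reset([&](const std::size_t /* thread */, const auto &used_slots) {
    std::int64_t local_sum = 0;
    for (const std::uint32_t slot : used_slots) {
      local_sum += ratings[slot];
    }
    total += local_sum;
  });
  std::cout << total << '\n'; // 2 * (0 + 1 + ... + 999) = 999'000
}
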
+ */ +template +class ConcurrentTwoLevelVector { + static_assert(std::numeric_limits::is_integer); + static_assert(std::numeric_limits::is_integer); + static_assert(sizeof(FirstValue) < sizeof(Value)); + + using HasherType = utils_tm::hash_tm::murmur2_hash; + using AllocatorType = ::growt::AlignedAllocator<>; + using ConcurrentHashTable = typename ::growt:: + table_config::table_type; + + // The maximum value of the FirstValue type is used as a special marker in the vector to indicate + // that the value is stored in the hash table. + static constexpr FirstValue kMaxFirstValue = std::numeric_limits::max(); + +public: + /*! + * Constructs a new ConcurrentTwoLevelVector. + * + * @param capacity The capacity of the vector. + */ + ConcurrentTwoLevelVector(const Size capacity = 0) + : _capacity(capacity), + _values(capacity), + _table(0) {} + + ConcurrentTwoLevelVector(const ConcurrentTwoLevelVector &) = delete; + ConcurrentTwoLevelVector &operator=(const ConcurrentTwoLevelVector &) = delete; + + ConcurrentTwoLevelVector(ConcurrentTwoLevelVector &&) noexcept = default; + ConcurrentTwoLevelVector &operator=(ConcurrentTwoLevelVector &&) noexcept = default; + + /*! + * Returns the number of elements that this vector can hold. + * + * @return The number of elements that this vector can hold. + */ + [[nodiscard]] Size capacity() const { + return _capacity; + } + + /*! + * Resizes the vector. + * + * @param capacity The capacity to resize to. + */ + void resize(const Size capacity) { + _values.resize(capacity); + _capacity = capacity; + } + + /*! + * Frees the memory used by this data structure. + */ + void free() { + _values.free(); + _table = ConcurrentHashTable(0); + _capacity = 0; + } + + /*! + * Resets the vector such that new elements can be inserted. + */ + void reset() { + // As growt does not provide a clear function, just create a new hash table. + _table = ConcurrentHashTable(0); + } + + /** + * Reassigns stored values according to a provided mapping. + * + * @param mapping The mapping according to which the values are reassigned. + * @param new_size The new size of the vector. + */ + void reassign(const StaticArray &mapping, const Size new_size) { + StaticArray new_values(new_size); + ConcurrentHashTable new_table(0); + + tbb::parallel_for(tbb::blocked_range(0, _values.size()), [&](const auto &r) { + for (Size pos = r.begin(); pos != r.end(); ++pos) { + const Value value = _values[pos]; + + if (value == kMaxFirstValue) { + Size new_pos = mapping[pos] - 1; + new_values[new_pos] = kMaxFirstValue; + + const Value actual_value = (*_table.get_handle().find(pos)).second; + new_table.get_handle().insert(new_pos, value); + } else if (value != 0) { + Size new_pos = mapping[pos] - 1; + new_values[new_pos] = value; + } + } + }); + + _values = std::move(new_values); + _table = std::move(new_table); + _capacity = new_size; + } + + /*! + * Accesses a value at a given position. + * + * @param pos The position of the value in the vector to return. + * @return The value at the given position. + */ + [[nodiscard]] Value operator[](const Size pos) { + KASSERT(pos < _values.size()); + + const Value value = _values[pos]; + if (value < kMaxFirstValue) { + return value; + } + + auto table_handle = _table.get_handle(); + auto it = table_handle.find(pos); + while (it == table_handle.end()) { + it = table_handle.find(pos); + } + + return (*it).second; + } + + /*! + * Inserts a value at a given position. + * + * @param pos The position in the vector at which the value is to be inserted. 
+ * @param value The value to insert. + */ + void insert(const Size pos, const Value value) { + KASSERT(pos < _values.size()); + + if (value < kMaxFirstValue) { + _values[pos] = value; + } else { + _values[pos] = kMaxFirstValue; + _table.get_handle().insert(pos, value); + } + } + + /** + * Adds atomically a value to a value already stored in the vector. + * + * @param pos The position in the vector at which the value is to be added. + * @param delta The value to add. + */ + void atomic_add(const Size pos, const Value delta) { + KASSERT(pos < _values.size()); + + FirstValue value = _values[pos]; + bool success; + do { + if (value == kMaxFirstValue) { + _table.get_handle().insert_or_update( + pos, delta, [&](auto &lhs, const auto rhs) { return lhs += rhs; }, delta + ); + break; + } + + const Value new_value = static_cast(value) + delta; + if (new_value < kMaxFirstValue) { + success = __atomic_compare_exchange_n( + &_values[pos], &value, new_value, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED + ); + } else { + success = __atomic_compare_exchange_n( + &_values[pos], &value, kMaxFirstValue, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED + ); + + if (success) { + _table.get_handle().insert_or_update( + pos, new_value, [&](auto &lhs, const auto rhs) { return lhs += rhs; }, new_value + ); + break; + } + } + + } while (!success); + } + + /** + * Subtracts atomically a value from a value already stored in the vector. + * + * @param pos The position in the vector at which the value is to be subtracted. + * @param delta The value to subtract. + */ + void atomic_sub(const Size pos, const Value delta) { + KASSERT(pos < _values.size()); + + FirstValue value = _values[pos]; + bool success; + do { + if (value == kMaxFirstValue) { + _table.get_handle().insert_or_update( + pos, -delta, [&](auto &lhs, const auto rhs) { return lhs -= rhs; }, delta + ); + break; + } + + success = __atomic_compare_exchange_n( + &_values[pos], &value, value - delta, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED + ); + } while (!success); + } + +private: + Size _capacity; + StaticArray _values; + ConcurrentHashTable _table; +}; +#else +/*! + * A concurrent two-level vector which consists of a vector and a hash table. The data structure + * stores values of small size directly in the vector and bigger values in the hash table. + * + * @tparam Value The type of integer to store. + * @tparam Size The type of integer to access the values with. + * @tparam FirstValue The type of integer to store in the vector. It has to be smaller than the + * value type. + */ +template +class ConcurrentTwoLevelVector { + static_assert(std::numeric_limits::is_integer); + static_assert(std::numeric_limits::is_integer); + static_assert(sizeof(FirstValue) < sizeof(Value)); + + using ConcurrentHashTable = tbb::concurrent_hash_map; + + // The maximum value of the FirstValue type is used as a special marker in the vector to indicate + // that the value is stored in the hash table. + static constexpr FirstValue kMaxFirstValue = std::numeric_limits::max(); + +public: + /*! + * Constructs a new ConcurrentTwoLevelVector. + * + * @param capacity The capacity of the vector. 
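For reference, a minimal usage sketch of the ConcurrentTwoLevelVector introduced above (not part of the patch; it behaves the same with or without growt). The template parameter list of this hunk is garbled, so the sketch assumes the order <Value, Size, FirstValue> from the doc comment. Values below the FirstValue maximum live in the first-level vector; the maximum itself serves as a marker and larger values spill into the hash table:

#include <cstdint>
#include <iostream>

#include "kaminpar-common/datastructures/concurrent_two_level_vector.h"

int main() {
  // 8-byte logical values, 4-byte indices, 2 bytes actually stored per slot.
  kaminpar::ConcurrentTwoLevelVector<std::uint64_t, std::uint32_t, std::uint16_t> weights(4);

  weights.insert(0, 7);       // fits into 16 bits: stored directly in the vector
  weights.insert(1, 100'000); // too large: slot is marked, value goes to the hash table

  weights.atomic_add(2, 60'000); // still fits into the first level
  weights.atomic_add(2, 10'000); // sum no longer fits: entry migrates to the hash table

  for (std::uint32_t i = 0; i < 3; ++i) {
    std::cout << weights[i] << '\n'; // 7, 100000, 70000
  }
}
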
+ */ + ConcurrentTwoLevelVector(const Size capacity = 0) : _capacity(capacity), _values(capacity) {} + + ConcurrentTwoLevelVector(const ConcurrentTwoLevelVector &) = delete; + ConcurrentTwoLevelVector &operator=(const ConcurrentTwoLevelVector &) = delete; + + ConcurrentTwoLevelVector(ConcurrentTwoLevelVector &&) noexcept = default; + ConcurrentTwoLevelVector &operator=(ConcurrentTwoLevelVector &&) noexcept = default; + + /*! + * Returns the number of elements that this vector can hold. + * + * @return The number of elements that this vector can hold. + */ + [[nodiscard]] Size capacity() const { + return _capacity; + } + + /*! + * Resizes the vector. + * + * @param capacity The capacity to resize to. + */ + void resize(const Size capacity) { + _values.resize(capacity); + _capacity = capacity; + } + + /*! + * Frees the memory used by this data structure. + */ + void free() { + _values.free(); + _table.clear(); + _capacity = 0; + } + + /*! + * Resets the vector such that new elements can be inserted. + */ + void reset() { + _table.clear(); + } + + /** + * Reassigns stored values according to a provided mapping. + * + * @param mapping The mapping according to which the values are reassigned. + * @param new_size The new size of the vector. + */ + void reassign(const StaticArray &mapping, const Size new_size) { + StaticArray new_values(new_size); + ConcurrentHashTable new_table; + + tbb::parallel_for(tbb::blocked_range(0, _values.size()), [&](const auto &r) { + for (Size pos = r.begin(); pos != r.end(); ++pos) { + const Value value = _values[pos]; + + if (value == kMaxFirstValue) { + Size new_pos = mapping[pos] - 1; + new_values[new_pos] = kMaxFirstValue; + + const Value actual_value = [&] { + typename ConcurrentHashTable::const_accessor entry; + _table.find(entry, pos); + return entry->second; + }(); + + typename ConcurrentHashTable::accessor entry; + new_table.insert(entry, new_pos); + entry->second = actual_value; + } else if (value != 0) { + Size new_pos = mapping[pos] - 1; + new_values[new_pos] = value; + } + } + }); + + _values = std::move(new_values); + _table = std::move(new_table); + _capacity = new_size; + } + + /*! + * Accesses a value at a given position. + * + * @param pos The position of the value in the vector to return. + * @return The value at the given position. + */ + [[nodiscard]] Value operator[](const Size pos) { + KASSERT(pos < _values.size()); + + const Value value = _values[pos]; + if (value < kMaxFirstValue) { + return value; + } + + typename ConcurrentHashTable::const_accessor entry; + bool found; + do { + found = _table.find(entry, pos); + } while (!found); + + return entry->second; + } + + /*! + * Inserts a value at a given position. + * + * @param pos The position in the vector at which the value is to be inserted. + * @param value The value to insert. + */ + void insert(const Size pos, const Value value) { + KASSERT(pos < _values.size()); + + if (value < kMaxFirstValue) { + _values[pos] = value; + } else { + _values[pos] = kMaxFirstValue; + + typename ConcurrentHashTable::accessor entry; + _table.insert(entry, pos); + entry->second = value; + } + } + + /** + * Adds atomically a value to a value already stored in the vector. + * + * @param pos The position in the vector at which the value is to be added. + * @param delta The value to add. 
+ */ + void atomic_add(const Size pos, const Value delta) { + KASSERT(pos < _values.size()); + + FirstValue value = _values[pos]; + bool success; + do { + if (value == kMaxFirstValue) { + typename ConcurrentHashTable::accessor entry; + if (_table.insert(entry, pos)) { + entry->second = delta; + } else { + entry->second += delta; + } + + break; + } + + const Value new_value = static_cast(value) + delta; + if (new_value < kMaxFirstValue) { + success = __atomic_compare_exchange_n( + &_values[pos], &value, new_value, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED + ); + } else { + success = __atomic_compare_exchange_n( + &_values[pos], &value, kMaxFirstValue, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED + ); + + if (success) { + typename ConcurrentHashTable::accessor entry; + if (_table.insert(entry, pos)) { + entry->second = new_value; + } else { + entry->second += new_value; + } + + break; + } + } + + } while (!success); + } + + /** + * Subtracts atomically a value from a value already stored in the vector. + * + * @param pos The position in the vector at which the value is to be subtracted. + * @param delta The value to subtract. + */ + void atomic_sub(const Size pos, const Value delta) { + KASSERT(pos < _values.size()); + + FirstValue value = _values[pos]; + bool success; + do { + if (value == kMaxFirstValue) { + typename ConcurrentHashTable::accessor entry; + if (_table.insert(entry, pos)) { + entry->second = -delta; + } else { + entry->second -= delta; + } + + break; + } + + success = __atomic_compare_exchange_n( + &_values[pos], &value, value - delta, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED + ); + } while (!success); + } + +private: + Size _capacity; + StaticArray _values; + ConcurrentHashTable _table; +}; +#endif + +} // namespace kaminpar diff --git a/kaminpar-common/datastructures/fast_reset_array.h b/kaminpar-common/datastructures/fast_reset_array.h index 1779a0dc..87142e8e 100644 --- a/kaminpar-common/datastructures/fast_reset_array.h +++ b/kaminpar-common/datastructures/fast_reset_array.h @@ -11,6 +11,8 @@ #include #include "kaminpar-common/assert.h" +#include "kaminpar-common/datastructures/scalable_vector.h" +#include "kaminpar-common/heap_profiler.h" #include "kaminpar-common/ranges.h" namespace kaminpar { @@ -21,7 +23,9 @@ template class FastResetArray { using const_reference = const Value &; using size_type = Size; - explicit FastResetArray(const std::size_t capacity = 0) : _data(capacity) {} + explicit FastResetArray(const std::size_t capacity = 0) : _data(capacity) { + RECORD_DATA_STRUCT(capacity * sizeof(value_type), _struct); + } FastResetArray(const FastResetArray &) = delete; FastResetArray &operator=(const FastResetArray &) = delete; @@ -30,9 +34,18 @@ template class FastResetArray { reference operator[](const size_type pos) { KASSERT(pos < _data.size()); + if (_data[pos] == Value()) { _used_entries.push_back(pos); + + IF_HEAP_PROFILING( + _struct->size = std::max( + _struct->size, + _data.capacity() * sizeof(value_type) + _used_entries.capacity() * sizeof(size_type) + ) + ); } + return _data[pos]; } const_reference operator[](const size_type pos) const { @@ -90,6 +103,13 @@ template class FastResetArray { } void resize(const std::size_t capacity) { _data.resize(capacity); + + IF_HEAP_PROFILING( + _struct->size = std::max( + _struct->size, + _data.capacity() * sizeof(value_type) + _used_entries.capacity() * sizeof(size_type) + ) + ); } [[nodiscard]] std::size_t memory_in_kb() const { @@ -97,7 +117,9 @@ template class FastResetArray { } private: - std::vector _data; + 
scalable_vector _data; std::vector _used_entries{}; + + IF_HEAP_PROFILING(heap_profiler::DataStructure *_struct); }; } // namespace kaminpar diff --git a/kaminpar-common/datastructures/fixed_size_sparse_map.h b/kaminpar-common/datastructures/fixed_size_sparse_map.h index 6ed5c996..66b62213 100644 --- a/kaminpar-common/datastructures/fixed_size_sparse_map.h +++ b/kaminpar-common/datastructures/fixed_size_sparse_map.h @@ -30,6 +30,7 @@ #include #include "kaminpar-common/assert.h" +#include "kaminpar-common/heap_profiler.h" #include "kaminpar-common/math.h" namespace kaminpar { @@ -72,6 +73,7 @@ class FixedSizeSparseMap { _timestamp(1), _sparse(nullptr), _dense(nullptr) { + RECORD_DATA_STRUCT(0, _struct); allocate(MAP_SIZE); } @@ -83,6 +85,7 @@ class FixedSizeSparseMap { _timestamp(1), _sparse(nullptr), _dense(nullptr) { + RECORD_DATA_STRUCT(0, _struct); allocate(max_size); } @@ -188,6 +191,12 @@ class FixedSizeSparseMap { _sparse = reinterpret_cast(_data.get()); _dense = reinterpret_cast(_data.get() + +sizeof(SparseElement) * _map_size); std::memset(_data.get(), 0, _map_size * (sizeof(Element) + sizeof(SparseElement))); + + IF_HEAP_PROFILING( + _struct->size = std::max( + _struct->size, _map_size * sizeof(Element) + _map_size * sizeof(SparseElement) + ) + ); } } @@ -203,5 +212,7 @@ class FixedSizeSparseMap { std::size_t _timestamp; SparseElement *_sparse; Element *_dense; + + IF_HEAP_PROFILING(heap_profiler::DataStructure *_struct); }; } // namespace kaminpar diff --git a/kaminpar-common/datastructures/marker.h b/kaminpar-common/datastructures/marker.h index c444a44d..0391a49e 100644 --- a/kaminpar-common/datastructures/marker.h +++ b/kaminpar-common/datastructures/marker.h @@ -14,14 +14,22 @@ #include "kaminpar-common/assert.h" +#include "kaminpar-common/heap_profiler.h" + namespace kaminpar { template class Marker { public: + explicit Marker() : _marker_id(0), _first_unmarked_element{0} { + RECORD_DATA_STRUCT(0, _struct); + } + explicit Marker(const std::size_t capacity) : _data(capacity), _marker_id(0), - _first_unmarked_element{0} {} + _first_unmarked_element{0} { + RECORD_DATA_STRUCT(capacity * sizeof(element_type), _struct); + } Marker(const Marker &) = delete; Marker &operator=(const Marker &) = delete; @@ -79,6 +87,7 @@ class Marker { } void resize(const std::size_t capacity) { + IF_HEAP_PROFILING(_struct->size = std::max(_struct->size, capacity * sizeof(element_type))); _data.resize(capacity); } @@ -90,5 +99,7 @@ class Marker { std::vector _data; element_type _marker_id; std::array _first_unmarked_element; + + IF_HEAP_PROFILING(heap_profiler::DataStructure *_struct); }; } // namespace kaminpar diff --git a/kaminpar-common/datastructures/queue.h b/kaminpar-common/datastructures/queue.h index d644582d..04dd619e 100644 --- a/kaminpar-common/datastructures/queue.h +++ b/kaminpar-common/datastructures/queue.h @@ -10,6 +10,8 @@ #include "kaminpar-common/assert.h" +#include "kaminpar-common/heap_profiler.h" + namespace kaminpar { /*! * Queue with fixed capacity. 
Add new elements to its tail and remove elements @@ -28,7 +30,9 @@ template class Queue { using iterator = typename std::vector::iterator; using const_iterator = typename std::vector::const_iterator; - explicit Queue(const std::size_t capacity) : _data(capacity) {} + explicit Queue(const std::size_t capacity) : _data(capacity) { + RECORD_DATA_STRUCT(capacity * sizeof(T), _struct); + } Queue(const Queue &) = delete; Queue &operator=(const Queue &) = delete; @@ -88,6 +92,7 @@ template class Queue { } void resize(const std::size_t capacity) { + IF_HEAP_PROFILING(_struct->size = std::max(_struct->size, capacity * sizeof(T))); _data.resize(capacity); clear(); } @@ -125,5 +130,7 @@ template class Queue { std::vector _data; std::size_t _head = 0; std::size_t _tail = 0; + + IF_HEAP_PROFILING(heap_profiler::DataStructure *_struct); }; } // namespace kaminpar diff --git a/kaminpar-common/datastructures/rating_map.h b/kaminpar-common/datastructures/rating_map.h index 7b5e1203..d95cebd4 100644 --- a/kaminpar-common/datastructures/rating_map.h +++ b/kaminpar-common/datastructures/rating_map.h @@ -110,7 +110,13 @@ class RatingMap { LARGE }; - explicit RatingMap(const std::size_t max_size) : _max_size{max_size} {} + explicit RatingMap(const std::size_t max_size = 0) : _max_size{max_size} {} + + RatingMap(const RatingMap &) = delete; + RatingMap &operator=(const RatingMap &) = delete; + + RatingMap(RatingMap &&) noexcept = default; + RatingMap &operator=(RatingMap &&) noexcept = default; MapType update_upper_bound(const std::size_t upper_bound_size) { select_map(upper_bound_size); @@ -133,6 +139,10 @@ class RatingMap { __builtin_unreachable(); } + [[nodiscard]] SmallMap &small_map() { + return _small_map; + } + [[nodiscard]] std::size_t small_map_counter() const { return _small_map_counter; } diff --git a/kaminpar-common/datastructures/scalable_vector.h b/kaminpar-common/datastructures/scalable_vector.h index b97d754b..05d602a7 100644 --- a/kaminpar-common/datastructures/scalable_vector.h +++ b/kaminpar-common/datastructures/scalable_vector.h @@ -5,12 +5,22 @@ #include "kaminpar-common/datastructures/noinit_vector.h" namespace kaminpar { +#ifdef KAMINPAR_ENABLE_HEAP_PROFILING +// @deprecated +template using scalable_vector = std::vector; +#else // @deprecated template using scalable_vector = std::vector>; +#endif +#ifdef KAMINPAR_ENABLE_HEAP_PROFILING +// @deprecated +template using scalable_noinit_vector = std::vector>; +#else // @deprecated template using scalable_noinit_vector = std::vector>>; +#endif template using ScalableVector = scalable_vector; diff --git a/kaminpar-common/datastructures/sparse_map.h b/kaminpar-common/datastructures/sparse_map.h index 1e13040b..d5ca6128 100644 --- a/kaminpar-common/datastructures/sparse_map.h +++ b/kaminpar-common/datastructures/sparse_map.h @@ -32,6 +32,8 @@ #include "kaminpar-common/assert.h" +#include "kaminpar-common/heap_profiler.h" + namespace kaminpar { template class SparseMap { struct Element { @@ -40,9 +42,12 @@ template class SparseMap { }; public: - SparseMap() = default; + SparseMap() { + RECORD_DATA_STRUCT(0, _struct); + } explicit SparseMap(const std::size_t capacity) : _capacity(capacity) { + RECORD_DATA_STRUCT(0, _struct); allocate_data(capacity); } @@ -144,6 +149,8 @@ template class SparseMap { _data = std::make_unique(num_elements); _sparse = reinterpret_cast(_data.get()); _dense = reinterpret_cast(_sparse + _capacity); + + IF_HEAP_PROFILING(_struct->size = std::max(_struct->size, num_elements * sizeof(std::size_t))); } std::size_t _capacity = 
0; @@ -151,5 +158,7 @@ template class SparseMap { std::unique_ptr _data = nullptr; std::size_t *_sparse = nullptr; Element *_dense = nullptr; + + IF_HEAP_PROFILING(heap_profiler::DataStructure *_struct); }; } // namespace kaminpar diff --git a/kaminpar-common/datastructures/static_array.h b/kaminpar-common/datastructures/static_array.h index e1126c2a..2cc29a00 100644 --- a/kaminpar-common/datastructures/static_array.h +++ b/kaminpar-common/datastructures/static_array.h @@ -7,6 +7,7 @@ #pragma once #include +#include #include #include #include @@ -14,7 +15,7 @@ #include #include "kaminpar-common/assert.h" -#include "kaminpar-common/parallel/atomic.h" +#include "kaminpar-common/heap_profiler.h" #include "kaminpar-common/parallel/tbb_malloc.h" namespace kaminpar { @@ -127,27 +128,27 @@ template class StaticArray { using iterator = StaticArrayIterator; using const_iterator = const StaticArrayIterator; - struct no_init {}; - - StaticArray(T *storage, const std::size_t size) : _size(size), _data(storage) {} + StaticArray(T *storage, const std::size_t size) : _size(size), _data(storage) { + RECORD_DATA_STRUCT(size * sizeof(T), _struct); + } StaticArray(const std::size_t start, const std::size_t size, StaticArray &data) : StaticArray(size, data._data + start) { KASSERT(start + size <= data.size()); } - StaticArray(const std::size_t size, value_type *data) : _size(size), _data(data) {} + StaticArray(const std::size_t size, value_type *data) : _size(size), _data(data) { + RECORD_DATA_STRUCT(size * sizeof(T), _struct); + } StaticArray(const std::size_t size, const value_type init_value = value_type()) { + RECORD_DATA_STRUCT(0, _struct); resize(size, init_value); } - StaticArray(const std::size_t size, no_init) { - resize(size, no_init{}); - } - - StaticArray(static_array::noinit_t, const std::size_t size) { - resize(size, no_init{}); + StaticArray(const std::size_t size, static_array::noinit_t) { + RECORD_DATA_STRUCT(0, _struct); + resize(size, static_array::noinit); } template @@ -174,6 +175,18 @@ template class StaticArray { // Data access members // + void write(const size_type pos, const_reference value) { + at(pos) = value; + } + + reference at(const size_type pos) { + return _data[pos]; + } + + const_reference at(const size_type pos) const { + return _data[pos]; + } + reference operator[](const size_type pos) { KASSERT(pos < _size); return _data[pos]; @@ -270,12 +283,7 @@ template class StaticArray { return _size; } - void resize(static_array::noinit_t, const std::size_t size) { - KASSERT(_data == _owned_data.get(), "cannot resize span", assert::always); - allocate_data(size); - } - - void resize(const std::size_t size, no_init) { + void resize(const std::size_t size, static_array::noinit_t) { KASSERT(_data == _owned_data.get(), "cannot resize span", assert::always); allocate_data(size); } @@ -285,7 +293,7 @@ template class StaticArray { const value_type init_value = value_type(), const bool assign_parallel = true ) { - resize(size, no_init{}); + resize(size, static_array::noinit); assign(size, init_value, assign_parallel); } @@ -319,49 +327,25 @@ template class StaticArray { _data = _owned_data.get(); _size = size; _unrestricted_size = _size; + + IF_HEAP_PROFILING(_struct->size = std::max(_struct->size, size * sizeof(value_type))); } size_type _size = 0; size_type _unrestricted_size = 0; parallel::tbb_unique_ptr _owned_data = nullptr; value_type *_data = nullptr; + + IF_HEAP_PROFILING(heap_profiler::DataStructure *_struct); }; namespace static_array { -template StaticArray copy(const 
StaticArray &arr) { - StaticArray cpy(arr.size()); - tbb::parallel_for(0, arr.size(), [&](const std::size_t i) { cpy[i] = arr[i]; }); - return cpy; -} - -template StaticArray create_from(const std::vector &vec) { - StaticArray arr(vec.size()); - std::copy(vec.begin(), vec.end(), arr.begin()); - return arr; +template StaticArray create(std::initializer_list list) { + return {list.begin(), list.end()}; } -template -StaticArray> create_atomic_from(const std::vector &vec) { - StaticArray> arr(vec.size()); - for (std::size_t i = 0; i < vec.size(); ++i) { - arr[i].store(vec[i]); - } - return arr; -} - -template std::vector release(const StaticArray &arr) { - std::vector vec(arr.size()); - std::copy(arr.begin(), arr.end(), vec.begin()); - return vec; -} - -template -std::vector release_nonatomic(const StaticArray> &arr) { - std::vector vec(arr.size()); - for (std::size_t i = 0; i < arr.size(); ++i) { - vec[i] = arr[i].load(); - } - return vec; +template StaticArray create(const std::vector &vec) { + return {vec.begin(), vec.end()}; } } // namespace static_array } // namespace kaminpar diff --git a/kaminpar-common/datastructures/ts_navigable_linked_list.h b/kaminpar-common/datastructures/ts_navigable_linked_list.h index c9891673..cee2bd59 100644 --- a/kaminpar-common/datastructures/ts_navigable_linked_list.h +++ b/kaminpar-common/datastructures/ts_navigable_linked_list.h @@ -91,10 +91,16 @@ template typename Container using NavigationMarker = typename LocalNavigableLinkedList::Marker; namespace ts_navigable_list { -template typename Container> -Container> combine( +template < + typename Key, + typename Element, + template + typename Container, + template + typename GlobalContainer> +GlobalContainer> combine( NavigableLinkedList &list, - Container> global_markers = {} + GlobalContainer> global_markers = {} ) { parallel::Atomic global_pos = 0; std::size_t num_markers = 0; diff --git a/kaminpar-common/heap_profiler.cc b/kaminpar-common/heap_profiler.cc new file mode 100644 index 00000000..de86ae3c --- /dev/null +++ b/kaminpar-common/heap_profiler.cc @@ -0,0 +1,332 @@ +/******************************************************************************* + * Heap profiler to measure heap memory usage. 
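For reference, a minimal sketch of the new static_array::create helpers from the static_array.h hunk above, which replace the removed copy/create_from/release functions (not part of the patch; template arguments in the hunk are garbled, so the sketch assumes the create<T> form):

#include <cstdint>
#include <iostream>
#include <vector>

#include "kaminpar-common/datastructures/static_array.h"

int main() {
  using namespace kaminpar;

  // Build a StaticArray from an initializer list ...
  StaticArray<std::uint32_t> a = static_array::create<std::uint32_t>({1, 2, 3});

  // ... or by copying an existing std::vector.
  std::vector<std::uint32_t> vec{4, 5, 6};
  StaticArray<std::uint32_t> b = static_array::create(vec);

  // An uninitialized allocation that is filled afterwards via the new write().
  StaticArray<std::uint32_t> c(3, static_array::noinit);
  for (std::size_t i = 0; i < c.size(); ++i) {
    c.write(i, a[i] + b[i]);
  }

  for (const std::uint32_t value : c) {
    std::cout << value << '\n'; // 5, 7, 9
  }
}
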
+ * + * @file: heap_profiler.cc + * @author: Daniel Salwasser + * @date: 21.10.2023 + ******************************************************************************/ +#include "kaminpar-common/heap_profiler.h" + +#include + +#include + +namespace kaminpar::heap_profiler { + +HeapProfiler &HeapProfiler::global() { + static HeapProfiler global("Global Heap Profiler"); + return global; +} + +HeapProfiler::HeapProfiler(std::string_view name) : _tree(name) {} + +HeapProfiler::~HeapProfiler() { + _tree.root.free(_node_allocator, _struct_allocator); +} + +void HeapProfiler::enable() { + _enabled = true; +} + +void HeapProfiler::disable() { + _enabled = false; +} + +void HeapProfiler::start_profile(std::string_view name, std::string desc) { + if (_enabled) { + std::lock_guard guard(_mutex); + + HeapProfileTreeNode *node = _node_allocator.create(name, desc, _tree.currentNode); + _tree.currentNode->children.push_back(node); + _tree.currentNode = node; + } +} + +void HeapProfiler::stop_profile() { + if (_enabled) { + KASSERT(_tree.currentNode->parent != nullptr, "The root heap profile cannot be stopped."); + std::lock_guard guard(_mutex); + + _tree.currentNode = _tree.currentNode->parent; + } +} + +ScopedHeapProfiler HeapProfiler::start_scoped_profile(std::string_view name, std::string desc) { + return ScopedHeapProfiler(name, desc); +} + +void HeapProfiler::record_data_struct( + std::string_view var_name, std::string_view file_name, std::size_t line +) { + if (_enabled) { + _var_name = var_name; + _file_name = file_name; + _line = line; + } +} + +DataStructure *HeapProfiler::add_data_struct(std::string name, std::size_t size) { + if (_enabled) { + std::lock_guard guard(_mutex); + + DataStructure *data_structure = _struct_allocator.create(std::move(name), size); + if (_line != 0) { + data_structure->variable_name = _var_name; + data_structure->file_name = _file_name; + data_structure->line = _line; + + _line = 0; + } + + _tree.currentNode->data_structures.push_back(data_structure); + return data_structure; + } + + return new DataStructure(std::move(name), size); +} + +void HeapProfiler::record_alloc(const void *ptr, std::size_t size) { + if (_enabled) { + std::lock_guard guard(_mutex); + + for (HeapProfileTreeNode *node = _tree.currentNode; node != nullptr; node = node->parent) { + node->allocs++; + node->alloc_size += size; + + if (std::size_t current_alloc = node->alloc_size - node->free_size; + node->alloc_size > node->free_size && current_alloc > node->max_alloc_size) { + node->max_alloc_size = current_alloc; + } + } + + _address_map.insert_or_assign(ptr, size); + } +} + +void HeapProfiler::record_free(const void *ptr) { + if (_enabled) { + std::lock_guard guard(_mutex); + + if (auto search = _address_map.find(ptr); search != _address_map.end()) { + std::size_t size = search->second; + for (HeapProfileTreeNode *node = _tree.currentNode; node != nullptr; node = node->parent) { + node->frees++; + node->free_size += size; + } + + _address_map.erase(search); + } + } +} + +void HeapProfiler::set_detailed_summary_options() { + set_max_depth(std::numeric_limits::max()); + set_print_data_structs(true); + set_min_data_struct_size(1); +} + +void HeapProfiler::set_max_depth(std::size_t max_depth) { + _max_depth = max_depth; +} + +void HeapProfiler::set_print_data_structs(bool print) { + _print_data_structs = print; +} + +void HeapProfiler::set_min_data_struct_size(float size) { + _min_data_struct_size = static_cast(size * 1024 * 1024); +} + +void HeapProfiler::print_heap_profile(std::ostream &out) { + 
HeapProfileTreeNode &root = *_tree.currentNode; + HeapProfileTreeStats stats(root); + + stats.max_alloc_size = + std::max(kMaxAllocTitle.length(), to_megabytes(stats.max_alloc_size).length()); + stats.alloc_size = std::max(kAllocTitle.length(), to_megabytes(stats.alloc_size).length()); + stats.free_size = std::max(kAllocTitle.length(), to_megabytes(stats.free_size).length()); + stats.allocs = std::max(kAllocsTitle.length(), std::to_string(stats.allocs).length()); + stats.frees = std::max(kFreesTitle.length(), std::to_string(stats.frees).length()); + + out << std::string(stats.len + kNameDel.length() + kPercentageLength - 1, kHeadingPadding) << ' '; + out << kMaxAllocTitle << std::string(stats.max_alloc_size - kMaxAllocTitle.length() + 1, ' '); + out << kAllocTitle << std::string(stats.alloc_size - kAllocTitle.length() + 1, ' '); + out << kFreeTitle << std::string(stats.free_size - kFreeTitle.length() + 1, ' '); + out << kAllocsTitle << std::string(stats.allocs - kAllocsTitle.length() + 1, ' '); + out << kFreesTitle << std::string(stats.frees - kFreesTitle.length() + 1, ' '); + out << '\n'; + + print_heap_tree_node(out, root, stats, _max_depth, _print_data_structs, _min_data_struct_size); + out << '\n'; +} + +std::size_t HeapProfiler::get_max_alloc() { + return _tree.currentNode->max_alloc_size; +} + +std::size_t HeapProfiler::get_alloc() { + return _tree.currentNode->alloc_size; +} + +std::size_t HeapProfiler::get_free() { + return _tree.currentNode->free_size; +} + +std::size_t HeapProfiler::get_allocs() { + return _tree.currentNode->allocs; +} + +std::size_t HeapProfiler::get_frees() { + return _tree.currentNode->frees; +} + +void HeapProfiler::print_heap_tree_node( + std::ostream &out, + const HeapProfileTreeNode &node, + const HeapProfileTreeStats stats, + std::size_t max_depth, + bool print_data_structs, + std::size_t min_data_struct_size, + std::size_t depth, + bool last +) { + if (depth > max_depth) { + return; + } + + print_indentation(out, depth, last); + print_percentage(out, node); + + out << node.name; + + std::size_t padding_length = stats.len - (depth * kBranchLength + node.name.length()); + if (!node.description.empty()) { + padding_length -= node.description.length() + 2; + out << '(' << node.description << ')'; + } + + out << kNameDel; + if (padding_length > 0) { + out << std::string(padding_length - 1, kPadding) << ' '; + } + + print_statistics(out, node, stats); + if (print_data_structs) { + print_data_structures(out, node, depth, node.children.empty(), min_data_struct_size); + } + + if (!node.children.empty()) { + const auto last_child = node.children.back(); + + for (auto const &child : node.children) { + const bool is_last = (child == last_child); + print_heap_tree_node( + out, + *child, + stats, + max_depth, + print_data_structs, + min_data_struct_size, + depth + 1, + is_last + ); + } + } +} + +void HeapProfiler::print_indentation(std::ostream &out, std::size_t depth, bool last) { + if (depth > 0) { + std::size_t leading_whitespaces = (depth - 1) * kBranchLength; + out << std::string(leading_whitespaces, ' ') << (last ? kTailBranch : kBranch); + } +} + +void HeapProfiler::print_percentage(std::ostream &out, const HeapProfileTreeNode &node) { + std::size_t parent_alloc_size = node.parent == nullptr ? 0 : node.parent->alloc_size; + float percentage = (parent_alloc_size == 0) ? 
1 : (node.alloc_size / (float)parent_alloc_size); + + out << "("; + + if (percentage >= 0.999995) { + out << "100.00"; + } else { + if (percentage < 0.1) { + out << "0"; + } + + out << percentage * 100; + } + + out << "%) "; +} + +void HeapProfiler::print_statistics( + std::ostream &out, const HeapProfileTreeNode &node, const HeapProfileTreeStats stats +) { + std::string max_alloc_size = to_megabytes(node.max_alloc_size); + out << max_alloc_size << std::string(stats.max_alloc_size - max_alloc_size.length() + 1, ' '); + + std::string alloc_size = to_megabytes(node.alloc_size); + out << alloc_size << std::string(stats.alloc_size - alloc_size.length() + 1, ' '); + + std::string free_size = to_megabytes(node.free_size); + out << free_size << std::string(stats.free_size - free_size.length() + 1, ' '); + + out << node.allocs << std::string(stats.allocs - std::to_string(node.allocs).length() + 1, ' ') + << node.frees << std::string(stats.frees - std::to_string(node.frees).length(), ' ') << '\n'; +} + +void HeapProfiler::print_data_structures( + std::ostream &out, + const HeapProfileTreeNode &node, + std::size_t depth, + bool last, + std::size_t min_data_struct_size +) { + std::vector> filtered_data_structures; + std::copy_if( + node.data_structures.begin(), + node.data_structures.end(), + std::back_inserter(filtered_data_structures), + [&](auto *data_structure) { return data_structure->size >= min_data_struct_size; } + ); + + if (filtered_data_structures.empty()) { + return; + } + + std::sort( + filtered_data_structures.begin(), + filtered_data_structures.end(), + [](auto *d1, auto *d2) { return d1->size > d2->size; } + ); + + auto last_data_structure = filtered_data_structures.back(); + for (auto data_structure : filtered_data_structures) { + const bool is_last = last && (data_structure == last_data_structure); + const bool has_info = data_structure->line > 0; + + std::size_t leading_whitespaces = depth * kBranchLength; + out << std::string(leading_whitespaces, ' ') << (is_last ? kTailBranch : kBranch); + + std::size_t max_alloc_size = node.max_alloc_size; + float percentage = (max_alloc_size == 0) ? 1 : (data_structure->size / (float)max_alloc_size); + if (percentage <= 1) { + out << '(' << (percentage * 100) << "%) "; + } + + out << data_structure->name; + if (has_info) { + out << " \"" << data_structure->variable_name << '\"'; + } + out << " uses " << to_megabytes(data_structure->size) << " mb "; + + if (has_info) { + out << " (" << data_structure->file_name << " at line " << data_structure->line << ')'; + } + + out << '\n'; + } +} + +} // namespace kaminpar::heap_profiler diff --git a/kaminpar-common/heap_profiler.h b/kaminpar-common/heap_profiler.h new file mode 100644 index 00000000..cd82a732 --- /dev/null +++ b/kaminpar-common/heap_profiler.h @@ -0,0 +1,595 @@ +/******************************************************************************* + * Heap profiler to measure heap memory usage. + * + * @file: heap_profiler.h + * @author: Daniel Salwasser + * @date: 21.10.2023 + ******************************************************************************/ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "kaminpar-common/libc_memory_override.h" + +namespace kaminpar::heap_profiler { + +/*! + * Returns the (demangled) name of a type. + * + * See https://stackoverflow.com/a/25893042 + * + * @tparam T The type whose name to return. + * @return The (demangled) name of the type T. 
+ */ +template std::string type_name() { + auto mangeled_name = typeid(T()).name(); + int status = 0; + + std::unique_ptr demangled_result{ + abi::__cxa_demangle(mangeled_name, NULL, NULL, &status), std::free + }; + + // Strip the trailing brackets from the constructed function type. + std::string name((status == 0) ? demangled_result.get() : mangeled_name); + if (name.substr(name.size() - 3) == " ()") { + name.resize(name.size() - 3); + } + + // Remove the namespace from the type name. + auto it = name.find_last_of("::"); + if (it != std::string::npos) { + name = name.substr(it + 1); + } + + // Remove the asterisk from a this pointer. + if (name.back() == '*') { + name.resize(name.size() - 1); + } + + return name; +} + +}; // namespace kaminpar::heap_profiler + +#ifdef KAMINPAR_ENABLE_HEAP_PROFILING + +// A macro to get the path of a source file in the project directory +// (https://stackoverflow.com/a/40947954) +#ifndef SOURCE_PATH_SIZE +#define SOURCE_PATH_SIZE 0 +#endif + +#define __FILENAME__ ((__FILE__) + (SOURCE_PATH_SIZE)) +#define GET_MACRO(X, Y, Z, FUNC, ...) FUNC + +#define START_HEAP_PROFILER_2(name, desc) \ + kaminpar::heap_profiler::HeapProfiler::global().start_profile(name, desc) +#define START_HEAP_PROFILER_1(name) START_HEAP_PROFILER_2(name, "") +#define START_HEAP_PROFILER(...) \ + GET_MACRO(_, __VA_ARGS__, START_HEAP_PROFILER_2, START_HEAP_PROFILER_1)(__VA_ARGS__) + +#define STOP_HEAP_PROFILER() kaminpar::heap_profiler::HeapProfiler::global().stop_profile() + +#define SCOPED_HEAP_PROFILER_2(name, desc, line) \ + const auto __SCOPED_HEAP_PROFILER__##line = \ + kaminpar::heap_profiler::HeapProfiler::global().start_scoped_profile(name, desc) +#define SCOPED_HEAP_PROFILER_1(name, line) SCOPED_HEAP_PROFILER_2(name, "", line) +#define SCOPED_HEAP_PROFILER(...) \ + GET_MACRO(_, __VA_ARGS__, SCOPED_HEAP_PROFILER_2, SCOPED_HEAP_PROFILER_1)(__VA_ARGS__, __LINE__) + +#define RECORD_DATA_STRUCT_2(size, variable_name) \ + variable_name = kaminpar::heap_profiler::HeapProfiler::global().add_data_struct( \ + kaminpar::heap_profiler::type_name(), size \ + ) +#define RECORD_DATA_STRUCT_1(size) \ + kaminpar::heap_profiler::HeapProfiler::global().add_data_struct( \ + kaminpar::heap_profiler::type_name(), size \ + ) +#define RECORD_DATA_STRUCT(...) \ + GET_MACRO(_, __VA_ARGS__, RECORD_DATA_STRUCT_2, RECORD_DATA_STRUCT_1)(__VA_ARGS__) + +#define RECORD_LOCAL_DATA_STRUCT_2(name, size, variable_name) \ + const auto variable_name = \ + kaminpar::heap_profiler::HeapProfiler::global().add_data_struct(name, size) +#define RECORD_LOCAL_DATA_STRUCT_1(name, size) \ + kaminpar::heap_profiler::HeapProfiler::global().add_data_struct(name, size) +#define RECORD_LOCAL_DATA_STRUCT(...) \ + GET_MACRO(__VA_ARGS__, RECORD_LOCAL_DATA_STRUCT_2, RECORD_LOCAL_DATA_STRUCT_1)(__VA_ARGS__) + +#define RECORD(name) \ + kaminpar::heap_profiler::HeapProfiler::global().record_data_struct(name, __FILENAME__, __LINE__); + +#define IF_HEAP_PROFILING(expression) expression + +#define ENABLE_HEAP_PROFILER() kaminpar::heap_profiler::HeapProfiler::global().enable() +#define DISABLE_HEAP_PROFILER() kaminpar::heap_profiler::HeapProfiler::global().disable() + +#define PRINT_HEAP_PROFILE(out) \ + kaminpar::heap_profiler::HeapProfiler::global().print_heap_profile(out) + +/*! + * Whether heap profiling is enabled. + */ +constexpr bool kHeapProfiling = true; + +#else + +#define START_HEAP_PROFILER(...) +#define STOP_HEAP_PROFILER() +#define SCOPED_HEAP_PROFILER(...) +#define RECORD_DATA_STRUCT(...) 
+#define RECORD_LOCAL_DATA_STRUCT(...) +#define RECORD(...) +#define IF_HEAP_PROFILING(...) +#define ENABLE_HEAP_PROFILER() +#define DISABLE_HEAP_PROFILER() +#define PRINT_HEAP_PROFILE(...) + +/*! + * Whether heap profiling is enabled. + */ +constexpr bool kHeapProfiling = false; + +#endif + +#ifdef KAMINPAR_ENABLE_PAGE_PROFILING +constexpr bool kPageProfiling = true; +#else +constexpr bool kPageProfiling = false; +#endif + +namespace kaminpar::heap_profiler { + +/*! + * A minimal allocator that uses memory allocation functions which bypass the heap profiler. + * + * This is required for allocations inside the heap profiler, otherwise a memory allocation would + * lead to an infinite recursion. + */ +template struct NoProfilAllocator { + using value_type = T; + + NoProfilAllocator() noexcept {} + template NoProfilAllocator(const NoProfilAllocator &) noexcept {} + + template bool operator==(const NoProfilAllocator &) const noexcept { + return true; + } + template bool operator!=(const NoProfilAllocator &) const noexcept { + return false; + } + + T *allocate(const size_t n) const { + if (n == 0) { + return nullptr; + } + + if (n > static_cast(-1) / sizeof(T)) { + throw std::bad_array_new_length(); + } + +#ifdef KAMINPAR_ENABLE_HEAP_PROFILING + void *const ptr = std_malloc(n * sizeof(T)); +#else + void *const ptr = std::malloc(n * sizeof(T)); +#endif + if (!ptr) { + throw std::bad_alloc(); + } + + return static_cast(ptr); + } + + void deallocate(T *const ptr, size_t) const noexcept { +#ifdef KAMINPAR_ENABLE_HEAP_PROFILING + std_free(ptr); +#else + std::free(ptr); +#endif + } + + template T *create(Args &&...args) const { + T *t = allocate(1); + new (t) T(std::forward(args)...); + return t; + } + + void destruct(T *const t) const { + t->~T(); + deallocate(t, 1); + } +}; + +/*! + * Represents a data structure in the program. It contains information about a data structure that + * is tracked by the heap profiler. + */ +struct DataStructure { + /*! + * The name of the data structure. + */ + std::string name; + + /*! + * The size of the memory in bytes allocated on the heap by the data structure. + */ + std::size_t size; + + /*! + * The name of the variable storing the data structure. It is empty if it is not available. + */ + std::string_view variable_name; + /*! + * The name of the source file of the variable storing the data structure. It is empty if it is + * not available. + */ + std::string_view file_name; + /*! + * The line of the variable storing the data structure. It is zero if it is not available. + */ + std::size_t line; + + /*! + * Constructs a new data structure. + * + * @param name The name of the data structure. + * @param size The size of the memory in bytes allocated on the heap by the data structure. + */ + explicit DataStructure(std::string name, std::size_t size) + : name(std::move(name)), + size(size), + variable_name(""), + file_name(""), + line(0) {} +}; + +class ScopedHeapProfiler; + +/*! + * A hierarchical heap profiler to measure dynamic memory allocation of the program. + * + * The memory allocation operations of libc are overridden to additionally call the global heap + * profiler on each allocation and deallocation request. 
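+ *
+ * A minimal usage sketch via the profiling macros (illustrative only; the profile names are made
+ * up):
+ * \code
+ * ENABLE_HEAP_PROFILER();
+ * START_HEAP_PROFILER("Coarsening", "first level");
+ * // ... code that allocates and frees heap memory ...
+ * STOP_HEAP_PROFILER();
+ * DISABLE_HEAP_PROFILER();
+ * PRINT_HEAP_PROFILE(std::cout);
+ * \endcode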
+ */ +class HeapProfiler { +private: + static constexpr std::string_view kMaxAllocTitle = "Peak Memory (mb)"; + static constexpr std::string_view kAllocTitle = "Total Alloc (mb)"; + static constexpr std::string_view kFreeTitle = "Total Free (mb)"; + static constexpr std::string_view kAllocsTitle = "Allocs"; + static constexpr std::string_view kFreesTitle = "Frees"; + + static constexpr std::string_view kBranch = "|- "; + static constexpr std::string_view kTailBranch = "`- "; + static constexpr std::string_view kTailEdge = " "; + static constexpr std::string_view kNameDel = ": "; + static constexpr char kHeadingPadding = '-'; + static constexpr char kPadding = '.'; + + static constexpr std::size_t kBranchLength = 3; + static constexpr std::size_t kPercentageLength = 10; + static constexpr std::size_t kDataStructSizeThreshold = 1024; + + static std::string to_megabytes(std::size_t bytes) { + std::stringstream stream; + stream << std::fixed << std::setprecision(2) << (bytes / (float)(1024 * 1024)); + return stream.str(); + } + + struct HeapProfileTreeNode { + std::string_view name; + std::string description; + + HeapProfileTreeNode *parent; + std::vector> children; + + std::size_t max_alloc_size; + std::size_t alloc_size; + std::size_t free_size; + std::size_t allocs; + std::size_t frees; + + std::vector> data_structures; + + HeapProfileTreeNode(std::string_view name, std::string description, HeapProfileTreeNode *parent) + : name(name), + description(description), + parent(parent), + max_alloc_size(0), + alloc_size(0), + free_size(0), + allocs(0), + frees(0) {} + + template + void free(NodeAllocator node_allocator, DataStructAllocator data_struct_allocator) { + for (DataStructure *data_structure : data_structures) { + data_struct_allocator.destruct(data_structure); + } + + for (HeapProfileTreeNode *child : children) { + child->free(node_allocator, data_struct_allocator); + node_allocator.destruct(child); + } + } + }; + + struct HeapProfileTree { + HeapProfileTreeNode root; + HeapProfileTreeNode *currentNode; + + HeapProfileTree(std::string_view name) : root(name, "", nullptr), currentNode(&root) {} + }; + + struct HeapProfileTreeStats { + std::size_t len; + std::size_t max_alloc_size; + std::size_t alloc_size; + std::size_t free_size; + std::size_t allocs; + std::size_t frees; + + HeapProfileTreeStats(const HeapProfileTreeNode &node) { + std::size_t name_length = node.name.length(); + if (!node.description.empty()) { + name_length += node.description.length() + 2; + } + + len = name_length; + max_alloc_size = node.max_alloc_size; + alloc_size = node.alloc_size; + free_size = node.free_size; + allocs = node.allocs; + frees = node.frees; + + for (auto const &child : node.children) { + HeapProfileTreeStats child_stats(*child); + + len = std::max(len, child_stats.len + kBranchLength); + max_alloc_size = std::max(max_alloc_size, child_stats.max_alloc_size); + alloc_size = std::max(alloc_size, child_stats.alloc_size); + free_size = std::max(free_size, child_stats.free_size); + allocs = std::max(allocs, child_stats.allocs); + frees = std::max(frees, child_stats.frees); + } + } + }; + +public: + /** + * Returns the global heap profiler. + * + * @return The global heap profiler. + */ + static HeapProfiler &global(); + + /*! + * Constructs a new heap profiler. + * + * @param name The name of the heap profiler and the name of the root profile. + */ + explicit HeapProfiler(std::string_view name); + + /*! + * Destroys the heap profiler. + */ + ~HeapProfiler(); + + /*! + * Starts profiling the heap. 
+ */ + void enable(); + + /*! + * Stops profiling the heap. + */ + void disable(); + + /** + * Starts a new profile, adds it as a child profile to the current profile, and sets it to the + * current profile. + * + * @param name The name of the profile to start. + * @param desc The description of the profile to start. + */ + void start_profile(std::string_view name, std::string desc); + + /*! + * Stops the current profile and sets the new current profile to the parent profile. + */ + void stop_profile(); + + /*! + * Starts a scoped heap profile and returns the associated object. + * + * @param name The name of the profile to start. + * @param desc The description of the profile to start. + */ + ScopedHeapProfiler start_scoped_profile(std::string_view name, std::string desc); + + /*! + * Records information about the variable storing the next data structure that is added to the + * heap profiler. + * + * @param var_name The name of the variable storing the data structure. + * @param file_name The name of the source file of the variable storing the data structure. + * @param line The line of the variable storing the data structure. + */ + void record_data_struct(std::string_view var_name, std::string_view file_name, std::size_t line); + + /*! + * Adds a data structure to track to the current profile of the heap profiler. If information + * about the variable that stores the data structure has been recorded by the heap profiler, it is + * added. + * + * @param name The name of the data structure. + * @param size The size of the memory in bytes allocated on the heap by the data structure. + * @return A pointer to the object holding information about the data structure or a nullptr if + * the heap profiler is disabled. + */ + DataStructure *add_data_struct(std::string name, std::size_t size); + + /*! + * Records a memory allocation. + * + * @param ptr The pointer to the beginning of the allocated memory. + * @param size The number allocated bytes. + */ + void record_alloc(const void *ptr, std::size_t size); + + /*! + * Records a memory deallocation. + * + * @param ptr The pointer to the beginning of the allocated memory + */ + void record_free(const void *ptr); + + /*! + * Sets the options such that the printed summary contains detailed information. + */ + void set_detailed_summary_options(); + + /*! + * Sets the maximum depth shown in the summary. + * + * @param max_depth The maximum depth shown in the summary. + */ + void set_max_depth(std::size_t max_depth); + + /*! + * Sets the option whether to print data structure memory statistics in the summary. + * + * @param print Whether to print data structure memory statistics in the summary. + */ + void set_print_data_structs(bool print); + + /*! + * Sets the minimum size of a data structure in MB to be included in the summary. + * + * @param size The minimum size of a data structure in MB to be included in the summary. + */ + void set_min_data_struct_size(float size); + + /*! + * Prints information about the heap profile to the output stream. + * + * @param out The output stream to write to. + */ + void print_heap_profile(std::ostream &out); + + /*! + * Returns the amount of maximum allocated memory in bytes of the current heap profile. + * + * @return The amount of maximum allocated memory in bytes of the current heap profile. + */ + std::size_t get_max_alloc(); + + /*! + * Returns the amount of allocated memory in bytes of the current heap profile. + * + * @return The amount of allocated memory in bytes of the current heap profile. 
+ */ + std::size_t get_alloc(); + + /*! + * Returns the amount of freed memory in bytes of the current heap profile. + * + * @return The amount of freed memory in bytes of the current heap profile. + */ + std::size_t get_free(); + + /*! + * Returns the amount of alloc operations of the current heap profile. + * + * @return The amount of alloc operations of the current heap profile. + */ + std::size_t get_allocs(); + + /*! + * Returns the amount of free operations of the current heap profile. + * + * @return The amount of free operations of the current heap profile. + */ + std::size_t get_frees(); + +private: + bool _enabled = false; + std::mutex _mutex; + + NoProfilAllocator _node_allocator; + HeapProfileTree _tree; + std::unordered_map< + const void *, + std::size_t, + std::hash, + std::equal_to, + NoProfilAllocator>> + _address_map; + + NoProfilAllocator _struct_allocator; + std::string_view _var_name; + std::string_view _file_name; + std::size_t _line; + + std::size_t _max_depth = std::numeric_limits::max(); + bool _print_data_structs = true; + std::size_t _min_data_struct_size = 0; + + static void print_heap_tree_node( + std::ostream &out, + const HeapProfileTreeNode &node, + const HeapProfileTreeStats stats, + std::size_t max_depth, + bool print_data_structs, + std::size_t min_data_struct_size, + std::size_t depth = 0, + bool last = false + ); + static void print_indentation(std::ostream &out, std::size_t depth, bool last); + static void print_percentage(std::ostream &out, const HeapProfileTreeNode &node); + static void print_statistics( + std::ostream &out, const HeapProfileTreeNode &node, const HeapProfileTreeStats stats + ); + static void print_data_structures( + std::ostream &out, + const HeapProfileTreeNode &node, + std::size_t depth, + bool last, + std::size_t min_data_struct_size + ); +}; + +/*! + * A helper class for scoped heap profiling. The profile starts with the construction of the object + * and ends with the destruction of the object. + */ +class ScopedHeapProfiler { +public: + /*! + * Constructs a new scoped heap profiler and thereby starting a new heap profile. + * + * @param name The name of the started profile. + * @param description The description of the started profile. + */ + ScopedHeapProfiler(std::string_view name, std::string description) { + HeapProfiler::global().start_profile(name, description); + } + + /*! + * Deconstructs the scoped heap profiler and thereby stopping the heap profile. + */ + inline ~ScopedHeapProfiler() { + HeapProfiler::global().stop_profile(); + } +}; + +} // namespace kaminpar::heap_profiler diff --git a/kaminpar-common/libc_memory_override.cc b/kaminpar-common/libc_memory_override.cc new file mode 100644 index 00000000..12508aaa --- /dev/null +++ b/kaminpar-common/libc_memory_override.cc @@ -0,0 +1,126 @@ +/******************************************************************************* + * This file overwrites the memory allocation operations of libc with operations that additionally + * invoke the heap profiler. 
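+ * On glibc, the wrappers defined below forward each request to the internal __libc_* entry points
+ * (e.g., __libc_malloc), so the original allocation still takes place and only the bookkeeping is
+ * added.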
+ *
+ * @file: libc_memory_override.cc
+ * @author: Daniel Salwasser
+ * @date: 22.10.2023
+ ******************************************************************************/
+#include "kaminpar-common/libc_memory_override.h"
+
+#include <cerrno>
+
+#include "kaminpar-common/heap_profiler.h"
+
+#ifdef KAMINPAR_ENABLE_HEAP_PROFILING
+#ifdef __GLIBC__
+extern "C" {
+
+using kaminpar::heap_profiler::HeapProfiler;
+
+extern void *__libc_malloc(size_t);
+extern void *__libc_calloc(size_t, size_t);
+extern void *__libc_realloc(void *, size_t);
+extern void __libc_free(void *);
+extern void *__libc_memalign(size_t, size_t);
+extern void *__libc_valloc(size_t);
+extern void *__libc_pvalloc(size_t);
+
+void *malloc(size_t size) {
+  void *ptr = __libc_malloc(size);
+  HeapProfiler::global().record_alloc(ptr, size);
+  return ptr;
+}
+
+void *calloc(size_t size, size_t n) {
+  void *ptr = __libc_calloc(size, n);
+  HeapProfiler::global().record_alloc(ptr, size * n);
+  return ptr;
+}
+
+void *realloc(void *p, size_t newsize) {
+  void *ptr = __libc_realloc(p, newsize);
+  HeapProfiler::global().record_free(p);
+  HeapProfiler::global().record_alloc(ptr, newsize);
+  return ptr;
+}
+
+void free(void *p) {
+  __libc_free(p);
+  HeapProfiler::global().record_free(p);
+}
+
+void *aligned_alloc(size_t alignment, size_t size) {
+  // Unlike the other allocation functions, glibc does not expose aligned_alloc through a weak
+  // __libc_* symbol. We therefore forward to __libc_memalign after checking that the requested
+  // alignment is valid.
+  bool is_power_of_2 = (alignment & (alignment - 1)) == 0;
+  if (!is_power_of_2 || alignment == 0) {
+    errno = EINVAL;
+    return nullptr;
+  }
+
+  void *ptr = __libc_memalign(alignment, size);
+  HeapProfiler::global().record_alloc(ptr, size);
+  return ptr;
+}
+
+void *memalign(size_t alignment, size_t size) {
+  void *ptr = __libc_memalign(alignment, size);
+  HeapProfiler::global().record_alloc(ptr, size);
+  return ptr;
+}
+
+void *valloc(size_t size) {
+  void *ptr = __libc_valloc(size);
+  HeapProfiler::global().record_alloc(ptr, size);
+  return ptr;
+}
+
+void *pvalloc(size_t size) {
+  void *ptr = __libc_pvalloc(size);
+  HeapProfiler::global().record_alloc(ptr, size);
+  return ptr;
+}
+
+#ifdef KAMINPAR_ENABLE_PAGE_PROFILING
+extern void *__mmap(void *, size_t, int, int, int, off_t);
+extern int __munmap(void *, size_t);
+
+void *mmap(void *addr, size_t len, int prot, int flags, int fd, __off_t offset) {
+  // Record the address actually returned by mmap, not the caller-supplied hint.
+  void *ptr = __mmap(addr, len, prot, flags, fd, offset);
+  HeapProfiler::global().record_alloc(ptr, len);
+  return ptr;
+}
+
+int munmap(void *addr, size_t len) {
+  int return_value = __munmap(addr, len);
+  HeapProfiler::global().record_free(addr);
+  return return_value;
+}
+#endif
+}
+#else
+#error Heap profiling is only supported on systems using glibc.
+#endif
+#endif
+
+namespace kaminpar::heap_profiler {
+
+void *std_malloc(std::size_t size) {
+#ifdef KAMINPAR_ENABLE_HEAP_PROFILING
+  return __libc_malloc(size);
+#else
+  return std::malloc(size);
+#endif
+}
+
+void std_free(void *ptr) {
+#ifdef KAMINPAR_ENABLE_HEAP_PROFILING
+  __libc_free(ptr);
+#else
+  std::free(ptr);
+#endif
+}
+
+} // namespace kaminpar::heap_profiler
diff --git a/kaminpar-common/libc_memory_override.h b/kaminpar-common/libc_memory_override.h
new file mode 100644
index 00000000..1d69a4fe
--- /dev/null
+++ b/kaminpar-common/libc_memory_override.h
@@ -0,0 +1,33 @@
+/*******************************************************************************
+ * This file overwrites the memory allocation operations of libc with operations that additionally
+ * invoke the heap profiler.
+ *
+ * @file: libc_memory_override.h
+ * @author: Daniel Salwasser
+ * @date: 22.10.2023
+ ******************************************************************************/
+#pragma once
+
+#include <cstddef>
+
+namespace kaminpar::heap_profiler {
+
+/*!
+ * Allocates size bytes of uninitialized memory. The allocation request is directly forwarded to
+ * malloc and thus not captured by the heap profiler.
+ *
+ * @param size The number of bytes to allocate.
+ *
+ * @return A pointer to the beginning of the newly allocated memory on success, otherwise a null
+ * pointer.
+ */
+void *std_malloc(std::size_t size);
+
+/*!
+ * Deallocates the memory previously allocated by std_malloc.
+ *
+ * @param ptr The pointer to the memory to be deallocated.
+ */
+void std_free(void *ptr);
+
+} // namespace kaminpar::heap_profiler
diff --git a/kaminpar-common/math.h b/kaminpar-common/math.h
index 45b76b16..461f7273 100644
--- a/kaminpar-common/math.h
+++ b/kaminpar-common/math.h
@@ -17,6 +17,44 @@ #include "kaminpar-common/assert.h" namespace kaminpar::math {
+
+/*!
+ * Returns the absolute value of a (possibly signed) integer as std::size_t.
+ *
+ * @param value The integer whose absolute value to return.
+ * @return The absolute value of the integer.
+ */
+template <typename Int> constexpr std::size_t abs(Int value) {
+  if (value < 0) {
+    value *= -1;
+  }
+
+  return static_cast<std::size_t>(value);
+}
+
+/*!
+ * Returns the absolute difference between two (possibly unsigned) integers.
+ *
+ * @param x The first integer.
+ * @param y The second integer.
+ * @return The absolute difference of x and y.
+ */
+template <typename Int1, typename Int2> constexpr std::size_t abs_diff(const Int1 x, const Int2 y) {
+  return x > y ? x - y : y - x;
+}
+
+/*!
+ * Divides two integers with ceil rounding.
+ *
+ * @param x The dividend, which must be non-zero.
+ * @param y The divisor.
+ * @return The ceiling of x divided by y.
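+ *
+ * For example, div_ceil(10, 4) == 3 and div_ceil(8, 4) == 2.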
+ */ +template constexpr Int1 div_ceil(const Int1 x, const Int2 y) { + return 1 + ((x - 1) / y); +} + template bool is_square(const Int value) { const Int sqrt = std::sqrt(value); return sqrt * sqrt == value; @@ -58,6 +96,15 @@ template T ceil2(const T arg) { return 1 << ceil_log2(arg); } +template constexpr Int byte_width(const Int i) { + if (i == 0) { + return 1; + } + + const Int bit_width = 1 + floor_log2(i); + return div_ceil(bit_width, 8); +} + template double percentile(const std::vector &sorted_sequence, const double percentile) { KASSERT([&] { @@ -75,7 +122,8 @@ double percentile(const std::vector &sorted_sequence, const double percentile template auto split_integral(const T value, const double ratio = 0.5) { return std::pair{ - static_cast(std::ceil(value * ratio)), static_cast(std::floor(value * (1.0 - ratio)))}; + static_cast(std::ceil(value * ratio)), static_cast(std::floor(value * (1.0 - ratio))) + }; } /** diff --git a/kaminpar-common/parallel/aligned_element.h b/kaminpar-common/parallel/aligned_element.h index 828965de..bf1d4bea 100644 --- a/kaminpar-common/parallel/aligned_element.h +++ b/kaminpar-common/parallel/aligned_element.h @@ -9,6 +9,7 @@ #include namespace kaminpar::parallel { + template struct alignas(64) Aligned { Value value; @@ -33,4 +34,44 @@ template struct alignas(64) Aligned { return value != other; } }; + +template struct alignas(64) AlignedVec { + Vector vec; + + AlignedVec() : vec() {} + AlignedVec(Vector vec) : vec(vec) {} + + decltype(auto) operator[](std::size_t pos) { + return vec[pos]; + } + + decltype(auto) operator[](std::size_t pos) const { + return vec[pos]; + } + + decltype(auto) begin() noexcept { + return vec.begin(); + } + + decltype(auto) begin() const noexcept { + return vec.begin(); + } + + decltype(auto) end() noexcept { + return vec.end(); + } + + decltype(auto) end() const noexcept { + return vec.end(); + } + + void clear() noexcept { + vec.clear(); + } + + void resize(std::size_t count) { + vec.resize(count); + } +}; + } // namespace kaminpar::parallel diff --git a/kaminpar-common/parallel/tbb_malloc.h b/kaminpar-common/parallel/tbb_malloc.h index 9edf1ca0..421b6052 100644 --- a/kaminpar-common/parallel/tbb_malloc.h +++ b/kaminpar-common/parallel/tbb_malloc.h @@ -11,30 +11,34 @@ #include #include "kaminpar-common/assert.h" +#include "kaminpar-common/heap_profiler.h" namespace kaminpar::parallel { template struct tbb_deleter { void operator()(T *p) { scalable_free(p); + + if constexpr (kHeapProfiling && !kPageProfiling) { + heap_profiler::HeapProfiler::global().record_free(p); + } } }; template using tbb_unique_ptr = std::unique_ptr>; +// template using tbb_unique_ptr = std::unique_ptr; template tbb_unique_ptr make_unique(const std::size_t size) { auto nbytes = sizeof(T) * size; T *ptr = static_cast(scalable_malloc(nbytes)); + KASSERT( - ptr != nullptr, - "probably out of memory after attemping to allocate " << nbytes << " bytes", - assert::light + ptr != nullptr, "out of memory: could not allocate " << nbytes << " bytes", assert::light ); - return tbb_unique_ptr(ptr, tbb_deleter{}); -} -template tbb_unique_ptr make_unique(Args &&...args) { - void *memory = static_cast(scalable_malloc(sizeof(T))); - T *ptr = new (memory) T(std::forward(args)...); + if constexpr (kHeapProfiling && !kPageProfiling) { + heap_profiler::HeapProfiler::global().record_alloc(ptr, sizeof(T) * size); + } + return tbb_unique_ptr(ptr, tbb_deleter{}); } } // namespace kaminpar::parallel diff --git a/kaminpar-common/ranges.h b/kaminpar-common/ranges.h index 
4bcfa5b3..e69799a3 100644 --- a/kaminpar-common/ranges.h +++ b/kaminpar-common/ranges.h @@ -7,6 +7,7 @@ ******************************************************************************/ #pragma once +#include #include #include diff --git a/kaminpar-common/varint_codec.cc b/kaminpar-common/varint_codec.cc new file mode 100644 index 00000000..d2bfed3c --- /dev/null +++ b/kaminpar-common/varint_codec.cc @@ -0,0 +1,32 @@ +/******************************************************************************* + * Encoding and decoding methods for VarInts. + * + * @file: varint_codec.cc + * @author: Daniel Salwasser + * @date: 26.12.2023 + ******************************************************************************/ +#include "kaminpar-common/varint_codec.h" + +namespace kaminpar { + +namespace debug { + +static VarIntStats stats = {0, 0, 0, 0, 0, 0}; + +void varint_stats_reset() { + stats.varint_count = 0; + stats.signed_varint_count = 0; + stats.marked_varint_count = 0; + + stats.varint_bytes = 0; + stats.signed_varint_bytes = 0; + stats.marked_varint_bytes = 0; +} + +VarIntStats &varint_stats_global() { + return stats; +} + +} // namespace debug + +} // namespace kaminpar diff --git a/kaminpar-common/varint_codec.h b/kaminpar-common/varint_codec.h new file mode 100644 index 00000000..5ee0158e --- /dev/null +++ b/kaminpar-common/varint_codec.h @@ -0,0 +1,556 @@ +/******************************************************************************* + * Encoding and decoding methods for VarInts. + * + * @file: varint_codec.h + * @author: Daniel Salwasser + * @date: 11.11.2023 + ******************************************************************************/ +#pragma once + +#include +#include +#include + +#include + +namespace kaminpar { + +namespace debug { + +/*! + * Whether to track statistics on encoded VarInts. + */ +static constexpr bool kTrackVarintStats = false; + +/*! + * Statistics about encoded VarInts. + */ +struct VarIntStats { + std::size_t varint_count; + std::size_t signed_varint_count; + std::size_t marked_varint_count; + + std::size_t varint_bytes; + std::size_t signed_varint_bytes; + std::size_t marked_varint_bytes; +}; + +/*! + * Reset the global statistics on encoded VarInts. + */ +void varint_stats_reset(); + +/*! + * Returns a reference to the global statistics on encoded VarInts. + * + * @return A reference to the global statistics on encoded VarInts. + */ +VarIntStats &varint_stats_global(); + +} // namespace debug + +/*! + * Encodes a signed integer using zigzag encoding. + * + * @param i The signed integer to encode. + * @return The encoded integer. + */ +template [[nodiscard]] std::make_unsigned_t zigzag_encode(Int i) { + return (i >> (sizeof(Int) * 8 - 1)) ^ (i << 1); +} + +/*! + * Decodes a zigzag encoded integer. + * + * @param i The zigzag encoded integer to decode. + * @return The decoded integer. + */ +template [[nodiscard]] std::make_signed_t zigzag_decode(Int i) { + return (i >> 1) ^ -(i & 1); +} + +/*! + * Returns the maximum number of bytes that a VarInt needs to be stored. + * + * @tparam Int The type of integer whose encoded maximum length is returned. + */ +template [[nodiscard]] constexpr std::size_t varint_max_length() { + return (sizeof(Int) * 8) / 7 + 1; +} + +/*! + * Returns the number of bytes a VarInt needs to be stored. + * + * @tparam Int The type of integer whose encoded length is returned. + * @param Int The integer to store. + * @return The number of bytes the integer needs to be stored. 
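+ *
+ * For example, values up to 127 occupy a single byte and values from 128 to 16383 occupy two
+ * bytes, since each encoded byte carries seven payload bits.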
+ */ +template [[nodiscard]] std::size_t varint_length(Int i) { + std::size_t len = 1; + + while (i > 0b01111111) { + i >>= 7; + len++; + } + + return len; +} + +/*! + * Returns the number of bytes a signed VarInt needs to be stored. + * + * @tparam Int The type of integer whose encoded length is returned. + * @param Int The integer to store. + * @return The number of bytes the integer needs to be stored. + */ +template [[nodiscard]] std::size_t signed_varint_length(Int i) { + return varint_length(zigzag_encode(i)); +} + +/*! + * Returns the number of bytes a marked VarInt needs to be stored. + * + * @tparam Int The type of integer whose encoded length is returned. + * @param Int The integer to store. + * @return The number of bytes the integer needs to be stored. + */ +template [[nodiscard]] std::size_t marked_varint_length(Int i) { + std::size_t len = 1; + + i >>= 6; + if (i > 0) { + len += varint_length(i); + } + + return len; +} + +/*! + * Writes an integer to a memory location as a VarInt. + * + * @tparam Int The type of integer to encode. + * @param Int The integer to store. + * @param ptr The pointer to the memory location to write the integer to. + * @return The number of bytes that the integer occupies at the memory location. + */ +template std::size_t varint_encode(Int i, std::uint8_t *ptr) { + std::size_t len = 1; + + while (i > 0b01111111) { + std::uint8_t octet = (i & 0b01111111) | 0b10000000; + *ptr = octet; + + i >>= 7; + ptr++; + len++; + } + + std::uint8_t last_octet = i & 0b01111111; + *ptr = last_octet; + + if (debug::kTrackVarintStats) { + debug::varint_stats_global().varint_count++; + debug::varint_stats_global().varint_bytes += len; + } + + return len; +} + +/*! + * Writes an integer to a memory location as a signed VarInt. + * + * @tparam Int The type of integer to encode. + * @param Int The integer to store. + * @param ptr The pointer to the memory location to write the integer to. + * @return The number of bytes that the integer occupies at the memory location. + */ +template std::size_t signed_varint_encode(Int i, std::uint8_t *ptr) { + const std::size_t len = varint_encode(zigzag_encode(i), ptr); + + if (debug::kTrackVarintStats) { + debug::varint_stats_global().signed_varint_count++; + debug::varint_stats_global().signed_varint_bytes += len; + } + + return len; +} + +/*! + * Writes an integer to a memory location as a marked VarInt. + * + * @tparam Int The type of integer to encode. + * @param Int The integer to store. + * @param marker_set Whether the integer is marked. + * @param ptr The pointer to the memory location to write the integer to. + * @return The number of bytes that the integer occupies at the memory location. + */ +template +std::size_t marked_varint_encode(Int i, bool marker_set, std::uint8_t *ptr) { + std::uint8_t first_octet; + + if (marker_set) { + first_octet = (i & 0b00111111) | 0b01000000; + } else { + first_octet = (i & 0b00111111); + } + + i >>= 6; + + if (i > 0) { + first_octet |= 0b10000000; + *ptr = first_octet; + + std::size_t len = varint_encode(i, ptr + 1) + 1; + + if (debug::kTrackVarintStats) { + debug::varint_stats_global().marked_varint_count++; + debug::varint_stats_global().marked_varint_bytes += len; + } + + return len; + } + + if (debug::kTrackVarintStats) { + debug::varint_stats_global().marked_varint_count++; + debug::varint_stats_global().marked_varint_bytes++; + } + + *ptr = first_octet; + return 1; +} + +/*! + * Reads an integer encoded as a VarInt from a memory location. 
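+ * For example, the value 300 is encoded by varint_encode as the two bytes 0b10101100 and
+ * 0b00000010 (a set continuation bit plus seven payload bits per byte) and is decoded back to
+ * 300 here.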
The decoding is implemented as a + * loop with non intrinsic operations. + * + * @tparam Int The type of integer to decode. + * @param ptr The pointer to the memory location to read the integer from. + * @return A pair consisting of the decoded integer and the number of bytes that the encoded integer + * occupied at the memory location. + */ +template +[[nodiscard]] std::pair varint_decode_general(const std::uint8_t *ptr) { + Int result = 0; + std::size_t shift = 0; + std::size_t position = 0; + + while (true) { + const std::uint8_t byte = ptr[position++]; + + if ((byte & 0b10000000) == 0) { + result |= static_cast(byte) << shift; + break; + } else { + result |= static_cast(byte & 0b01111111) << shift; + } + + shift += 7; + } + + return std::make_pair(result, position); +} + +/*! + * Reads an integer encoded as a VarInt from a memory location. + * + * @tparam Int The type of integer to decode. + * @param ptr The pointer to the memory location to read the integer from. + * @return A pair consisting of the decoded integer and the number of bytes that the encoded integer + * occupied at the memory location. + */ +template +[[nodiscard]] std::pair varint_decode(const std::uint8_t *ptr) { + return varint_decode_general(ptr); +} + +#ifdef KAMINPAR_COMPRESSION_FAST_DECODING +/*! + * Reads a 32-bit integer encoded as a VarInt from a memory location. The decoding is implemented + * as an unrolled loop with intrinsic operations. + * + * @param ptr The pointer to the memory location to read the integer from. + * @return A pair consisting of the decoded integer and the number of bytes that the encoded integer + * occupied at the memory location. + */ +template <> +inline std::pair varint_decode(const std::uint8_t *ptr) { + if ((ptr[0] & 0b10000000) == 0) { + const std::uint32_t result = *ptr & 0b01111111; + return std::make_pair(result, 1); + } + + if ((ptr[1] & 0b10000000) == 0) { + const std::uint32_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F7F); + return std::make_pair(result, 2); + } + + if ((ptr[2] & 0b10000000) == 0) { + const std::uint32_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F7F7F); + return std::make_pair(result, 3); + } + + if ((ptr[3] & 0b10000000) == 0) { + const std::uint32_t result = + _pext_u32(*reinterpret_cast(ptr), 0x7F7F7F7F); + return std::make_pair(result, 4); + } + + const std::uint32_t result = static_cast( + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F) + ); + return std::make_pair(result, 5); +} + +/*! + * Reads a 64-bit integer encoded as a VarInt from a memory location. The decoding is implemented + * as an unrolled loop with intrinsic operations. + * + * @param ptr The pointer to the memory location to read the integer from. + * @return A pair consisting of the decoded integer and the number of bytes that the encoded integer + * occupied at the memory location. 
+ */ +template <> +inline std::pair varint_decode(const std::uint8_t *ptr) { + if ((ptr[0] & 0b10000000) == 0) { + const std::uint64_t result = *ptr & 0b01111111; + return std::make_pair(result, 1); + } + + if ((ptr[1] & 0b10000000) == 0) { + const std::uint64_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F7F); + return std::make_pair(result, 2); + } + + if ((ptr[2] & 0b10000000) == 0) { + const std::uint64_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F7F7F); + return std::make_pair(result, 3); + } + + if ((ptr[3] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u32(*reinterpret_cast(ptr), 0x7F7F7F7F); + return std::make_pair(result, 4); + } + + if ((ptr[4] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F); + return std::make_pair(result, 5); + } + + if ((ptr[5] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F); + return std::make_pair(result, 6); + } + + if ((ptr[6] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F); + return std::make_pair(result, 7); + } + + if ((ptr[7] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F7F); + return std::make_pair(result, 8); + } + + if ((ptr[8] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F7F) | + (static_cast(ptr[8] & 0b01111111) << 56); + return std::make_pair(result, 9); + } + + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F7F) | + (static_cast(ptr[8] & 0b01111111) << 56) | + (static_cast(ptr[9]) << 63); + return std::make_pair(result, 10); +} +#endif + +/*! + * Reads an integer encoded as a signed VarInt from a memory location. The decoding is implemented + * as a loop with non intrinsic operations. + * + * @tparam Int The type of integer to decode. + * @param ptr The pointer to the memory location to read the integer from. + * @return A pair consisting of the decoded integer and the number of bytes that the encoded integer + * occupied at the memory location. + */ +template +[[nodiscard]] std::pair signed_varint_decode_general(const std::uint8_t *ptr) { + const auto [unsigned_value, len] = varint_decode_general>(ptr); + return std::make_pair(zigzag_decode(unsigned_value), len); +} + +/*! + * Reads an integer encoded as a signed VarInt from a memory location. + * + * @tparam Int The type of integer to decode. + * @param ptr The pointer to the memory location to read the integer from. + * @return A pair consisting of the decoded integer and the number of bytes that the encoded integer + * occupied at the memory location. + */ +template +[[nodiscard]] std::pair signed_varint_decode(const std::uint8_t *ptr) { + const auto [unsigned_value, len] = varint_decode>(ptr); + return std::make_pair(zigzag_decode(unsigned_value), len); +} + +/*! + * Reads an integer encoded as a marked VarInt from a memory location. The decoding is implemented + * as a loop with non intrinsic operations. + * + * @tparam Int The type of integer to decode. + * @param ptr The pointer to the memory location to read the integer from. + * @return A tuple consisting of the decoded integer, whether the markes is set and the number of + * bytes that the encoded integer occupied at the memory location. 
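+ *
+ * For example, the single byte 0b01000101 decodes to the value 5 with the marker set: the lowest
+ * six bits carry the value, bit 6 is the marker, and bit 7 is the continuation bit.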
+ */ +template +[[nodiscard]] std::tuple marked_varint_decode(const std::uint8_t *ptr) { + const std::uint8_t first_byte = *ptr; + const bool is_continuation_bit_set = (first_byte & 0b10000000) != 0; + const bool is_marker_set = (first_byte & 0b01000000) != 0; + + Int result = first_byte & 0b00111111; + std::size_t shift = 0; + std::size_t position = 1; + + if (is_continuation_bit_set) { + while (true) { + const std::uint8_t byte = ptr[position++]; + + if ((byte & 0b10000000) == 0) { + result |= static_cast(byte) << (shift + 6); + break; + } else { + result |= static_cast(byte & 0b01111111) << (shift + 6); + } + + shift += 7; + } + } + + return std::make_tuple(result, is_marker_set, position); +} + +#ifdef KAMINPAR_COMPRESSION_FAST_DECODING +/*! + * Reads a 32-bit integer encoded as a marked VarInt from a memory location. The decoding is + * implemented as an unrolled loop with intrinsic operations. + * + * @tparam Int The type of integer to decode. + * @param ptr The pointer to the memory location to read the integer from. + * @return A tuple consisting of the decoded integer, whether the markes is set and the number of + * bytes that the encoded integer occupied at the memory location. + */ +template <> +inline std::tuple +marked_varint_decode(const std::uint8_t *ptr) { + const bool is_marker_set = (*ptr & 0b01000000) != 0; + + if ((ptr[0] & 0b10000000) == 0) { + const std::uint32_t result = *ptr & 0b00111111; + return std::make_tuple(result, is_marker_set, 1); + } + + if ((ptr[1] & 0b10000000) == 0) { + const std::uint32_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F3F); + return std::make_tuple(result, is_marker_set, 2); + } + + if ((ptr[2] & 0b10000000) == 0) { + const std::uint32_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F7F3F); + return std::make_tuple(result, is_marker_set, 3); + } + + if ((ptr[3] & 0b10000000) == 0) { + const std::uint32_t result = + _pext_u32(*reinterpret_cast(ptr), 0x7F7F7F3F); + return std::make_tuple(result, is_marker_set, 4); + } + + const std::uint32_t result = static_cast( + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F3F) + ); + return std::make_tuple(result, is_marker_set, 5); +} + +/*! + * Reads a 64-bit integer encoded as a marked VarInt from a memory location. The decoding is + * implemented as an unrolled loop with intrinsic operations. + * + * @tparam Int The type of integer to decode. + * @param ptr The pointer to the memory location to read the integer from. + * @return A tuple consisting of the decoded integer, whether the markes is set and the number of + * bytes that the encoded integer occupied at the memory location. 
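+ *
+ * Note that the extraction masks end in 0x3F rather than 0x7F because the first byte contributes
+ * only six payload bits; bit 6 is the marker and bit 7 the continuation bit.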
+ */ +template <> +inline std::tuple +marked_varint_decode(const std::uint8_t *ptr) { + const bool is_marker_set = (*ptr & 0b01000000) != 0; + + if ((ptr[0] & 0b10000000) == 0) { + const std::uint64_t result = *ptr & 0b00111111; + return std::make_tuple(result, is_marker_set, 1); + } + + if ((ptr[1] & 0b10000000) == 0) { + const std::uint64_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F3F); + return std::make_tuple(result, is_marker_set, 2); + } + + if ((ptr[2] & 0b10000000) == 0) { + const std::uint64_t result = _pext_u32(*reinterpret_cast(ptr), 0x7F7F3F); + return std::make_tuple(result, is_marker_set, 3); + } + + if ((ptr[3] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u32(*reinterpret_cast(ptr), 0x7F7F7F3F); + return std::make_tuple(result, is_marker_set, 4); + } + + if ((ptr[4] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F3F); + return std::make_tuple(result, is_marker_set, 5); + } + + if ((ptr[5] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F3F); + return std::make_tuple(result, is_marker_set, 6); + } + + if ((ptr[6] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F3F); + return std::make_tuple(result, is_marker_set, 7); + } + + if ((ptr[7] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F3F); + return std::make_tuple(result, is_marker_set, 8); + } + + if ((ptr[8] & 0b10000000) == 0) { + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F3F) | + (static_cast(ptr[8] & 0b01111111) << 55); + return std::make_tuple(result, is_marker_set, 9); + } + + const std::uint64_t result = + _pext_u64(*reinterpret_cast(ptr), 0x7F7F7F7F7F7F7F3F) | + (static_cast(ptr[8] & 0b01111111) << 55) | + (static_cast(ptr[9]) << 62); + return std::make_tuple(result, is_marker_set, 10); +} +#endif + +} // namespace kaminpar diff --git a/kaminpar-common/varint_run_length_codec.h b/kaminpar-common/varint_run_length_codec.h new file mode 100644 index 00000000..6120bfb8 --- /dev/null +++ b/kaminpar-common/varint_run_length_codec.h @@ -0,0 +1,380 @@ +/******************************************************************************* + * Encoding and decoding methods for run-length VarInts. + * + * @file: varint_run_length_codec.h + * @author: Daniel Salwasser + * @date: 29.12.2023 + ******************************************************************************/ +#pragma once + +#include +#include +#include +#include + +namespace kaminpar { + +/*! + * An encoder for writing run-length VarInts. + * + * @tparam Int The type of integer to encode. + */ +template class VarIntRunLengthEncoder { + static_assert(sizeof(Int) == 4 || sizeof(Int) == 8); + +public: + static constexpr std::size_t kBufferSize = (sizeof(Int) == 4) ? 64 : 32; + + /*! + * Constructs a new VarIntRunLengthEncoder. + * + * @param ptr The pointer to the memory location where the encoded integers are written. + */ + VarIntRunLengthEncoder(std::uint8_t *ptr) : _ptr(ptr) {} + + /*! + * Encodes an integer. + * + * @param i The integer to encode. + * @return The number of bytes that the integer requires to be stored in encoded format. It + * includes the control byte if it is the first integer of a block. 
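+ *
+ * For example (32-bit case), a run of three values that each fit into two bytes is later written
+ * as the control byte 0b00001001 (run length 3, value width 2 bytes) followed by six little-endian
+ * data bytes.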
+ */ + std::size_t add(Int i) { + std::uint8_t size = needed_bytes(i); + + if (_buffer.empty()) { + _buffered_size = size++; + } else if (_buffer.size() == kBufferSize || _buffered_size != size) { + flush(); + _buffered_size = size++; + } + + _buffer.push_back(i); + return size; + } + + /*! + * Writes the remaining integers added to the encoder which do not form a complete block to + * memory. + */ + void flush() { + if (_buffer.empty()) { + return; + } + + const std::uint8_t *begin = _ptr; + if constexpr (sizeof(Int) == 4) { + const std::uint8_t header = (static_cast(_buffer.size() - 1) << 2) | + ((_buffered_size - 1) & 0b00000011); + *_ptr++ = header; + } else if constexpr (sizeof(Int) == 8) { + const std::uint8_t header = (static_cast(_buffer.size() - 1) << 3) | + ((_buffered_size - 1) & 0b00000111); + *_ptr++ = header; + } + + for (Int value : _buffer) { + for (std::uint8_t i = 0; i < _buffered_size; ++i) { + *_ptr++ = static_cast(value); + value >>= 8; + } + } + + _buffer.clear(); + } + +private: + std::uint8_t *_ptr; + + std::uint8_t _buffered_size; + std::vector _buffer; + + std::uint8_t needed_bytes(Int i) const { + std::size_t len = 1; + + while (i > 0b11111111) { + i >>= 8; + len++; + } + + return len; + } +}; + +/*! + * A decoder for reading run-length VarInts. + * + * @tparam Int The type of integer to decode. + */ +template class VarIntRunLengthDecoder { + static_assert(sizeof(Int) == 4 || sizeof(Int) == 8); + +public: + /*! + * Constructs a new VarIntRunLengthDecoder. + * + * @param ptr The pointer to the memory location where the encoded integers are stored. + */ + VarIntRunLengthDecoder(const std::uint8_t *ptr) : _ptr(ptr) {} + + /*! + * Decodes the encoded integers. + * + * @param max_decoded The amount of integers to decode. + * @param l The function to be called with the decoded integers, i.e. the function has one + * parameter of type Int. 
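+ *
+ * Usage sketch (illustrative; the surrounding names are made up):
+ * \code
+ * VarIntRunLengthDecoder<std::uint32_t> decoder(compressed_data);
+ * decoder.decode(num_values, [&](const std::uint32_t value) { sum += value; });
+ * \endcode
+ * If the lambda returns a bool instead of void, returning true stops the decoding early.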
+ */ + template void decode(const std::size_t max_decoded, Lambda &&l) { + constexpr bool non_stoppable = std::is_void>::value; + + std::size_t decoded = 0; + while (decoded < max_decoded) { + const std::uint8_t run_header = *_ptr++; + + if constexpr (sizeof(Int) == 4) { + std::uint8_t run_length = (run_header >> 2) + 1; + const std::uint8_t run_size = (run_header & 0b00000011) + 1; + + decoded += run_length; + if (decoded > max_decoded) { + run_length -= decoded - max_decoded; + } + + if constexpr (non_stoppable) { + decode32(run_length, run_size, std::forward(l)); + } else { + const bool stop = decode32(run_length, run_size, std::forward(l)); + if (stop) { + return; + } + } + } else if constexpr (sizeof(Int) == 8) { + std::uint8_t run_length = (run_header >> 3) + 1; + const std::uint8_t run_size = (run_header & 0b00000111) + 1; + + decoded += run_length; + if (decoded > max_decoded) { + run_length -= decoded - max_decoded; + } + + if constexpr (non_stoppable) { + decode64(run_length, run_size, std::forward(l)); + } else { + const bool stop = decode64(run_length, run_size, std::forward(l)); + if (stop) { + return; + } + } + } + } + } + +private: + const std::uint8_t *_ptr; + + template + bool decode32(const std::uint8_t run_length, const std::uint8_t run_size, Lambda &&l) { + constexpr bool non_stoppable = std::is_void>::value; + + switch (run_size) { + case 1: + for (std::uint8_t i = 0; i < run_length; ++i) { + std::uint32_t value = static_cast(*_ptr); + _ptr += 1; + + if constexpr (non_stoppable) { + l(value); + } else { + const bool stop = l(value); + if (stop) { + return true; + } + } + } + break; + case 2: + for (std::uint8_t i = 0; i < run_length; ++i) { + std::uint32_t value = *((std::uint16_t *)_ptr); + _ptr += 2; + + if constexpr (non_stoppable) { + l(value); + } else { + const bool stop = l(value); + if (stop) { + return true; + } + } + } + break; + case 3: + for (std::uint8_t i = 0; i < run_length; ++i) { + std::uint32_t value = *((std::uint32_t *)_ptr) & 0xFFFFFF; + _ptr += 3; + + if constexpr (non_stoppable) { + l(value); + } else { + const bool stop = l(value); + if (stop) { + return true; + } + } + } + break; + case 4: + for (std::uint8_t i = 0; i < run_length; ++i) { + std::uint32_t value = *((std::uint32_t *)_ptr); + _ptr += 4; + + if constexpr (non_stoppable) { + l(value); + } else { + const bool stop = l(value); + if (stop) { + return true; + } + } + } + break; + default: + throw std::runtime_error("unexpected run size"); + } + + return false; + } + + template + bool decode64(const std::uint8_t run_length, const std::uint8_t run_size, Lambda &&l) { + constexpr bool non_stoppable = std::is_void>::value; + + switch (run_size) { + case 1: + for (std::uint8_t i = 0; i < run_length; ++i) { + std::uint64_t value = static_cast(*_ptr); + _ptr += 1; + + if constexpr (non_stoppable) { + l(value); + } else { + const bool stop = l(value); + if (stop) { + return true; + } + } + } + break; + case 2: + for (std::uint8_t i = 0; i < run_length; ++i) { + std::uint64_t value = *((std::uint16_t *)_ptr); + _ptr += 2; + + if constexpr (non_stoppable) { + l(value); + } else { + const bool stop = l(value); + if (stop) { + return true; + } + } + } + break; + case 3: + for (std::uint8_t i = 0; i < run_length; ++i) { + std::uint64_t value = *((std::uint32_t *)_ptr) & 0xFFFFFF; + _ptr += 3; + + if constexpr (non_stoppable) { + l(value); + } else { + const bool stop = l(value); + if (stop) { + return true; + } + } + } + break; + case 4: + for (std::uint8_t i = 0; i < run_length; ++i) { + 
std::uint64_t value = *((std::uint32_t *)_ptr); + _ptr += 4; + + if constexpr (non_stoppable) { + l(value); + } else { + const bool stop = l(value); + if (stop) { + return true; + } + } + } + break; + case 5: + for (std::uint8_t i = 0; i < run_length; ++i) { + std::uint64_t value = *((std::uint64_t *)_ptr) & 0xFFFFFFFFFF; + _ptr += 5; + + if constexpr (non_stoppable) { + l(value); + } else { + const bool stop = l(value); + if (stop) { + return true; + } + } + } + break; + case 6: + for (std::uint8_t i = 0; i < run_length; ++i) { + std::uint64_t value = *((std::uint64_t *)_ptr) & 0xFFFFFFFFFFFF; + _ptr += 6; + + if constexpr (non_stoppable) { + l(value); + } else { + const bool stop = l(value); + if (stop) { + return true; + } + } + } + break; + case 7: + for (std::uint8_t i = 0; i < run_length; ++i) { + std::uint64_t value = *((std::uint64_t *)_ptr) & 0xFFFFFFFFFFFFFF; + _ptr += 7; + + if constexpr (non_stoppable) { + l(value); + } else { + const bool stop = l(value); + if (stop) { + return true; + } + } + } + break; + case 8: + for (std::uint8_t i = 0; i < run_length; ++i) { + std::uint64_t value = *((std::uint64_t *)_ptr); + _ptr += 8; + + if constexpr (non_stoppable) { + l(value); + } else { + const bool stop = l(value); + if (stop) { + return true; + } + } + } + break; + default: + throw std::runtime_error("unexpected run size"); + } + + return false; + } +}; + +}; // namespace kaminpar diff --git a/kaminpar-common/varint_stream_codec.h b/kaminpar-common/varint_stream_codec.h new file mode 100644 index 00000000..f6db0742 --- /dev/null +++ b/kaminpar-common/varint_stream_codec.h @@ -0,0 +1,307 @@ +/******************************************************************************* + * Encoding and decoding methods for the StreamVByte codec. + * + * @file: varint_stream_codec.h + * @author: Daniel Salwasser + * @date: 29.12.2023 + ******************************************************************************/ +#pragma once + +#include +#include + +#include + +#include "kaminpar-common/constexpr_utils.h" +#include "kaminpar-common/varint_codec.h" + +namespace kaminpar { + +/*! + * An encoder for writing variable length integers with the StreamVByte codec. + * + * @tparam Int The type of integer to encode. + */ +template class VarIntStreamEncoder { + static_assert(sizeof(Int) == 4); + +public: + /*! + * Constructs a new VarIntStreamEncoder. + * + * @param ptr The pointer to the memory location where the encoded integers are written. + * @param count The amount of integers to encode. + */ + VarIntStreamEncoder(std::uint8_t *ptr, std::size_t count) + : _control_bytes_ptr(ptr), + _data_ptr(ptr + count / 4 + ((count % 4) != 0)), + _count(count), + _buffered(0) {} + + /*! + * Encodes an integer. + * + * @param i The integer to encode. + * @return The number of bytes that the integer requires to be stored in encoded format. It + * includes the control byte if it is the last integer of a block. + */ + std::size_t add(Int i) { + if (_buffered == 3) { + _buffer[3] = i; + write_stream(); + + _buffered = 0; + return needed_bytes(i); + } + + _buffer[_buffered] = i; + return needed_bytes(i) + (_buffered++ == 0); + } + + /*! + * Writes the remaining integers added to the encoder which do not form a complete block to + * memory. 
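+ *
+ * For reference, a complete block of the four values 1, 300, 70000, 5 is stored as the control
+ * byte 0b00100100 (two bits per value length, the last value in the highest bits) followed by
+ * 1 + 2 + 3 + 1 = 7 little-endian data bytes; all control bytes are placed in a prefix block that
+ * precedes the data bytes.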
+ */ + void flush() { + if (_buffered == 0) { + return; + } + + const std::uint8_t control_byte = + ((needed_bytes(_buffer[3]) - 1) << 6) | (((needed_bytes(_buffer[2]) - 1) & 0b11) << 4) | + (((needed_bytes(_buffer[1]) - 1) & 0b11) << 2) | ((needed_bytes(_buffer[0]) - 1) & 0b11); + *_control_bytes_ptr++ = control_byte; + + for (std::size_t i = 0; i < _buffered; ++i) { + Int value = _buffer[i]; + do { + *_data_ptr++ = static_cast(value); + value >>= 8; + } while (value > 0); + } + } + +private: + std::uint8_t *_control_bytes_ptr; + std::uint8_t *_data_ptr; + const std::size_t _count; + + std::size_t _buffered; + std::array _buffer; + + void write_stream() { + const std::uint8_t control_byte = + ((needed_bytes(_buffer[3]) - 1) << 6) | (((needed_bytes(_buffer[2]) - 1) & 0b11) << 4) | + (((needed_bytes(_buffer[1]) - 1) & 0b11) << 2) | ((needed_bytes(_buffer[0]) - 1) & 0b11); + *_control_bytes_ptr++ = control_byte; + + for (Int value : _buffer) { + do { + *_data_ptr++ = static_cast(value); + value >>= 8; + } while (value > 0); + } + } + + std::uint8_t needed_bytes(Int i) const { + std::size_t len = 1; + + while (i > 0b11111111) { + i >>= 8; + len++; + } + + return len; + } +}; + +/*! + * A decoder for reading variable length integers stored with the StreamVByte codec. + * + * @tparam Int The type of integer to decode. + */ +template class VarIntStreamDecoder { + static_assert(sizeof(Int) == 4); + + static constexpr std::array create_length_table() { + std::array length_table{}; + + constexpr_for<256>([&](const std::uint8_t control_byte) { + length_table[control_byte] = 0; + + constexpr_for<4>([&](const std::uint8_t i) { + const std::uint8_t length = ((control_byte >> (2 * i)) & 0b11) + 1; + length_table[control_byte] += length; + }); + }); + + return length_table; + } + + static constexpr std::array, 256> create_shuffle_table() { + std::array, 256> shuffle_table{}; + + constexpr_for<256>([&](const std::uint8_t control_byte) { + std::uint8_t byte = 0; + std::uint8_t pos = 0; + + constexpr_for<4>([&](const std::uint8_t i) { + std::uint8_t c = (control_byte >> (2 * i)) & 0b11; + + std::uint8_t j = 0; + while (j <= c) { + shuffle_table[control_byte][pos++] = byte++; + j += 1; + } + + while (j < 4) { + shuffle_table[control_byte][pos++] = 0b11111111; + j += 1; + } + }); + }); + + return shuffle_table; + } + + static const constexpr std::array kLengthTable = create_length_table(); + + static const constexpr std::array, 256> kShuffleTable = + create_shuffle_table(); + +public: + /*! + * Constructs a new VarIntStreamDecoder. + * + * @param ptr The pointer to the memory location where the encoded integers are stored. + * @param count The amount of integers that are stored at the memory location. + */ + VarIntStreamDecoder(const std::uint8_t *ptr, const std::size_t count) + : _control_bytes_ptr(ptr), + _control_bytes(count / 4), + _data_ptr(ptr + _control_bytes + ((count % 4) != 0)), + _count(count) {} + + /*! + * Decodes the encoded integers. + * + * @param max_count The amount of integers to decode, it has to be less then the amount of + * integers stored that are stored. + * @param l The function to be called with the decoded integers, i.e. the function has one + * parameter of type Int. 
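+ *
+ * Usage sketch (illustrative; the surrounding names are made up):
+ * \code
+ * VarIntStreamDecoder<std::uint32_t> decoder(compressed_data, num_values);
+ * decoder.decode(num_values, [&](const std::uint32_t value) { degrees.push_back(value); });
+ * \endcode
+ * As with the other decoders, a lambda returning bool can stop the decoding early by returning
+ * true.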
+ */ + template void decode(const std::size_t max_count, Lambda &&l) { + constexpr bool non_stoppable = std::is_void>::value; + + // max_count = std::min(max_count, _count); + + const std::size_t control_bytes = max_count / 4; + for (std::size_t i = 0; i < control_bytes; ++i) { + const std::uint8_t control_byte = _control_bytes_ptr[i]; + const std::uint8_t length = kLengthTable[control_byte]; + + __m128i data = _mm_loadu_si128((const __m128i *)_data_ptr); + _data_ptr += length; + + const std::uint8_t *shuffle_mask = kShuffleTable[control_byte].data(); + data = _mm_shuffle_epi8(data, *(const __m128i *)shuffle_mask); + + if constexpr (non_stoppable) { + l(_mm_extract_epi32(data, 0)); + l(_mm_extract_epi32(data, 1)); + l(_mm_extract_epi32(data, 2)); + l(_mm_extract_epi32(data, 3)); + } else { + if (l(_mm_extract_epi32(data, 0))) { + return; + } + + if (l(_mm_extract_epi32(data, 1))) { + return; + } + + if (l(_mm_extract_epi32(data, 2))) { + return; + } + + if (l(_mm_extract_epi32(data, 3))) { + return; + } + } + } + + switch (max_count % 4) { + case 1: { + const std::uint8_t control_byte = _control_bytes_ptr[control_bytes]; + const std::uint8_t *shuffle_mask = kShuffleTable[control_byte].data(); + + __m128i data = _mm_loadu_si128((const __m128i *)_data_ptr); + data = _mm_shuffle_epi8(data, *(const __m128i *)shuffle_mask); + + if constexpr (non_stoppable) { + l(_mm_extract_epi32(data, 0)); + } else { + if (l(_mm_extract_epi32(data, 0))) { + return; + } + } + break; + } + case 2: { + const std::uint8_t control_byte = _control_bytes_ptr[control_bytes]; + const std::uint8_t *shuffle_mask = kShuffleTable[control_byte].data(); + + __m128i data = _mm_loadu_si128((const __m128i *)_data_ptr); + data = _mm_shuffle_epi8(data, *(const __m128i *)shuffle_mask); + + if constexpr (non_stoppable) { + l(_mm_extract_epi32(data, 0)); + l(_mm_extract_epi32(data, 1)); + } else { + if (l(_mm_extract_epi32(data, 0))) { + return; + } + + if (l(_mm_extract_epi32(data, 1))) { + return; + } + } + break; + } + case 3: { + const std::uint8_t control_byte = _control_bytes_ptr[control_bytes]; + const std::uint8_t *shuffle_mask = kShuffleTable[control_byte].data(); + + __m128i data = _mm_loadu_si128((const __m128i *)_data_ptr); + data = _mm_shuffle_epi8(data, *(const __m128i *)shuffle_mask); + + if constexpr (non_stoppable) { + l(_mm_extract_epi32(data, 0)); + l(_mm_extract_epi32(data, 1)); + l(_mm_extract_epi32(data, 2)); + } else { + if (l(_mm_extract_epi32(data, 0))) { + return; + } + + if (l(_mm_extract_epi32(data, 1))) { + return; + } + + if (l(_mm_extract_epi32(data, 2))) { + return; + } + } + break; + } + } + } + +private: + const std::uint8_t *_control_bytes_ptr; + const std::size_t _control_bytes; + const std::uint8_t *_data_ptr; + const std::size_t _count; +}; + +} // namespace kaminpar diff --git a/kaminpar-dist/coarsening/clustering/lp/global_lp_clusterer.cc b/kaminpar-dist/coarsening/clustering/lp/global_lp_clusterer.cc index 6d16afae..1cf875bf 100644 --- a/kaminpar-dist/coarsening/clustering/lp/global_lp_clusterer.cc +++ b/kaminpar-dist/coarsening/clustering/lp/global_lp_clusterer.cc @@ -9,14 +9,13 @@ #include +#include "kaminpar-mpi/sparse_alltoall.h" + #include "kaminpar-dist/datastructures/distributed_graph.h" #include "kaminpar-dist/datastructures/growt.h" +#include "kaminpar-dist/distributed_label_propagation.h" #include "kaminpar-dist/graphutils/communication.h" - -#include "kaminpar-shm/label_propagation.h" - -#include "kaminpar-common/datastructures/fast_reset_array.h" -#include "kaminpar-common/math.h" 
+#include "kaminpar-dist/timer.h" namespace kaminpar::dist { namespace { @@ -49,8 +48,8 @@ struct UnorderedRatingMap { }; struct GlobalLPClusteringConfig : public LabelPropagationConfig { - using Graph = DistributedGraph; using RatingMap = ::kaminpar::RatingMap; + using ClusterID = GlobalNodeID; using ClusterWeight = GlobalNodeWeight; @@ -70,8 +69,6 @@ class GlobalLPClusteringImpl final using ClusterBase = NonatomicOwnedClusterVector; using WeightDeltaMap = growt::GlobalNodeIDMap; - struct Statistics {}; - public: explicit GlobalLPClusteringImpl(const Context &ctx) : ClusterBase{ctx.partition.graph->total_n}, @@ -547,9 +544,7 @@ class GlobalLPClusteringImpl final from, to, [&](const NodeID lnode) { return _changed_label[lnode] != kInvalidGlobalNodeID; }, - [&](const NodeID lnode) -> ChangedLabelMessage { - return {lnode, cluster(lnode)}; - }, + [&](const NodeID lnode) -> ChangedLabelMessage { return {lnode, cluster(lnode)}; }, [&](const auto &buffer, const PEID owner) { tbb::parallel_for(tbb::blocked_range(0, buffer.size()), [&](const auto &r) { auto &weight_delta_handle = _weight_delta_handles_ets.local(); diff --git a/kaminpar-dist/coarsening/clustering/lp/local_lp_clusterer.cc b/kaminpar-dist/coarsening/clustering/lp/local_lp_clusterer.cc index 87a75cbb..3a4e279a 100644 --- a/kaminpar-dist/coarsening/clustering/lp/local_lp_clusterer.cc +++ b/kaminpar-dist/coarsening/clustering/lp/local_lp_clusterer.cc @@ -8,13 +8,13 @@ ******************************************************************************/ #include "kaminpar-dist/coarsening/clustering/lp/local_lp_clusterer.h" -#include "kaminpar-shm/label_propagation.h" +#include "kaminpar-dist/distributed_label_propagation.h" namespace kaminpar::dist { struct LocalLPClusteringConfig : public LabelPropagationConfig { - using Graph = DistributedGraph; using ClusterID = NodeID; using ClusterWeight = NodeWeight; + static constexpr bool kTrackClusterCount = false; static constexpr bool kUseTwoHopClustering = true; }; diff --git a/kaminpar-dist/coarsening/coarsener.cc b/kaminpar-dist/coarsening/coarsener.cc index 5c14b27e..1c6ca95f 100644 --- a/kaminpar-dist/coarsening/coarsener.cc +++ b/kaminpar-dist/coarsening/coarsener.cc @@ -9,13 +9,11 @@ #include "kaminpar-dist/coarsening/contraction/cluster_contraction.h" #include "kaminpar-dist/coarsening/contraction/local_cluster_contraction.h" -#include "kaminpar-dist/context.h" #include "kaminpar-dist/datastructures/distributed_graph.h" #include "kaminpar-dist/datastructures/distributed_partitioned_graph.h" #include "kaminpar-dist/factories.h" -#include "kaminpar-shm/context.h" -#include "kaminpar-shm/partition_utils.h" +#include "kaminpar-shm/coarsening/max_cluster_weights.h" namespace kaminpar::dist { SET_DEBUG(false); @@ -194,11 +192,11 @@ const DistributedGraph *Coarsener::nth_coarsest(const std::size_t n) const { GlobalNodeWeight Coarsener::max_cluster_weight() const { const auto *graph = coarsest(); - return shm::compute_max_cluster_weight( + return shm::compute_max_cluster_weight( _input_ctx.coarsening, + _input_ctx.partition, graph->global_n(), - graph->global_total_node_weight(), - _input_ctx.partition + graph->global_total_node_weight() ); } } // namespace kaminpar::dist diff --git a/kaminpar-dist/coarsening/contraction/cluster_contraction.cc b/kaminpar-dist/coarsening/contraction/cluster_contraction.cc index 79894cc1..f11ac8f4 100644 --- a/kaminpar-dist/coarsening/contraction/cluster_contraction.cc +++ b/kaminpar-dist/coarsening/contraction/cluster_contraction.cc @@ -104,7 +104,8 @@ 
find_nonlocal_nodes(const DistributedGraph &graph, const GlobalClustering &lnode const GlobalNodeID gcluster = lnode_to_gcluster[lnode]; if (!graph.is_owned_global_node(gcluster)) { nonlocal_nodes[node_position_buffer[lnode]] = { - .u = gcluster, .weight = graph.node_weight(lnode)}; + .u = gcluster, .weight = graph.node_weight(lnode) + }; } }); @@ -224,9 +225,7 @@ void update_ghost_node_weights(DistributedGraph &graph) { mpi::graph::sparse_alltoall_interface_to_pe( graph, - [&](const NodeID u) -> Message { - return {u, graph.node_weight(u)}; - }, + [&](const NodeID u) -> Message { return {u, graph.node_weight(u)}; }, [&](const auto buffer, const PEID pe) { tbb::parallel_for(0, buffer.size(), [&](const std::size_t i) { const auto &[local_node_on_other_pe, weight] = buffer[i]; @@ -424,7 +423,8 @@ MigrationResult migrate_elements( .sendcounts = std::move(sendcounts), .sdispls = std::move(sdispls), .recvcounts = std::move(recvcounts), - .rdispls = std::move(rdispls)}; + .rdispls = std::move(rdispls) + }; } MigrationResult @@ -816,9 +816,7 @@ void rebalance_cluster_placement( }; mpi::graph::sparse_alltoall_interface_to_pe( graph, - [&](const NodeID lnode) -> Message { - return {lnode, lnode_to_gcluster[lnode]}; - }, + [&](const NodeID lnode) -> Message { return {lnode, lnode_to_gcluster[lnode]}; }, [&](const auto buffer, const PEID pe) { tbb::parallel_for(0, buffer.size(), [&](const std::size_t i) { const auto &[their_lnode, new_gcluster] = buffer[i]; @@ -862,8 +860,8 @@ bool validate_clustering(const DistributedGraph &graph, const GlobalClustering & const NodeID lnode = graph.global_to_local_node(gnode); if (lnode_to_gcluster[lnode] != gcluster) { LOG_WARNING << "Inconsistent cluster for local node " << lnode - << " (ghost node, global node ID " << gnode << "): " - << "the node is owned by PE " << pe + << " (ghost node, global node ID " << gnode + << "): " << "the node is owned by PE " << pe << ", which assigned the node to cluster " << gcluster << ", but our ghost node is assigned to cluster " << lnode_to_gcluster[lnode] << "; aborting"; @@ -1322,7 +1320,9 @@ ContractionResult contract_clustering( // Finally, build coarse graph START_TIMER("Construct coarse graph"); auto all_buffered_nodes = - ts_navigable_list::combine(edge_buffer_ets); + ts_navigable_list::combine( + edge_buffer_ets + ); tbb::parallel_for(0, c_n, [&](const NodeID i) { const auto &marker = all_buffered_nodes[i]; @@ -1456,9 +1456,7 @@ DistributedPartitionedGraph project_partition( mpi::graph::sparse_alltoall_interface_to_pe( graph, - [&](const NodeID lnode) -> GhostNodeLabel { - return {lnode, partition[lnode]}; - }, + [&](const NodeID lnode) -> GhostNodeLabel { return {lnode, partition[lnode]}; }, [&](const auto buffer, const PEID pe) { tbb::parallel_for(0, buffer.size(), [&](const std::size_t i) { const auto &[sender_lnode, block] = buffer[i]; diff --git a/kaminpar-dist/coarsening/contraction/local_cluster_contraction.cc b/kaminpar-dist/coarsening/contraction/local_cluster_contraction.cc index 3f21e6ae..12266c35 100644 --- a/kaminpar-dist/coarsening/contraction/local_cluster_contraction.cc +++ b/kaminpar-dist/coarsening/contraction/local_cluster_contraction.cc @@ -272,7 +272,8 @@ Result contract_local_clustering( std::move(c_ghost_to_global), std::move(c_global_to_ghost), false, - graph.communicator()}; + graph.communicator() + }; return {std::move(c_graph), std::move(mapping), std::move(m_ctx)}; } diff --git a/kaminpar-dist/coarsening/contraction/local_cluster_contraction.h 
b/kaminpar-dist/coarsening/contraction/local_cluster_contraction.h index fbe98301..8925c519 100644 --- a/kaminpar-dist/coarsening/contraction/local_cluster_contraction.h +++ b/kaminpar-dist/coarsening/contraction/local_cluster_contraction.h @@ -24,7 +24,7 @@ struct MemoryContext { scalable_vector buckets; scalable_vector> buckets_index; scalable_vector> leader_mapping; - scalable_vector> all_buffered_nodes; + StaticArray> all_buffered_nodes; }; struct Result { diff --git a/kaminpar-dist/datastructures/distributed_graph.cc b/kaminpar-dist/datastructures/distributed_graph.cc index c0dc1e15..1a7d07fd 100644 --- a/kaminpar-dist/datastructures/distributed_graph.cc +++ b/kaminpar-dist/datastructures/distributed_graph.cc @@ -230,7 +230,7 @@ void print_local_graph_stats(const DistributedGraph &graph) { std::fill(buckets.begin(), buckets.end(), 0); EdgeID local_m = 0, nonlocal_m = 0; - EdgeID min_deg = std::numeric_limits::max(), max_deg = 0; + NodeID min_deg = std::numeric_limits::max(), max_deg = 0; for (NodeID u = 0; u < graph.n(); ++u) { for (const auto [e, v] : graph.neighbors(u)) { if (graph.is_owned_node(v)) { diff --git a/kaminpar-dist/datastructures/ghost_node_mapper.h b/kaminpar-dist/datastructures/ghost_node_mapper.h index 09d5d4f2..db259a69 100644 --- a/kaminpar-dist/datastructures/ghost_node_mapper.h +++ b/kaminpar-dist/datastructures/ghost_node_mapper.h @@ -9,14 +9,13 @@ #include -#include "kaminpar-mpi/wrapper.h" - #include "kaminpar-dist/datastructures/growt.h" #include "kaminpar-dist/dkaminpar.h" -#include "kaminpar-dist/logger.h" #include "kaminpar-common/assert.h" #include "kaminpar-common/datastructures/static_array.h" +#include "kaminpar-common/logger.h" +#include "kaminpar-common/parallel/atomic.h" namespace kaminpar::dist::graph { class GhostNodeMapper { @@ -87,7 +86,8 @@ class GhostNodeMapper { return { .global_to_ghost = std::move(global_to_ghost), .ghost_to_global = std::move(ghost_to_global), - .ghost_owner = std::move(ghost_owner)}; + .ghost_owner = std::move(ghost_owner) + }; } private: diff --git a/kaminpar-dist/distributed_label_propagation.h b/kaminpar-dist/distributed_label_propagation.h new file mode 100644 index 00000000..9e181ced --- /dev/null +++ b/kaminpar-dist/distributed_label_propagation.h @@ -0,0 +1,1310 @@ +/******************************************************************************* + * Generic implementation of parallel label propagation. + * + * @file: parallel_label_propagation.h + * @author: Daniel Seemaier + * @date: 21.09.2021 + ******************************************************************************/ +#pragma once + +#include +#include + +#include +#include +#include +#include + +#include "kaminpar-dist/datastructures/distributed_graph.h" + +#include "kaminpar-common/assert.h" +#include "kaminpar-common/datastructures/dynamic_map.h" +#include "kaminpar-common/datastructures/rating_map.h" +#include "kaminpar-common/datastructures/scalable_vector.h" +#include "kaminpar-common/logger.h" +#include "kaminpar-common/parallel/atomic.h" +#include "kaminpar-common/random.h" +#include "kaminpar-common/tags.h" + +namespace kaminpar::dist { +struct LabelPropagationConfig { + using Graph = DistributedGraph; + + // Data structure used to accumulate edge weights for gain value calculation + using RatingMap = ::kaminpar::RatingMap>; + + // Data type for cluster IDs and weights + using ClusterID = tag::Mandatory; + using ClusterWeight = tag::Mandatory; + + // Approx. 
number of edges per work unit + static constexpr shm::NodeID kMinChunkSize = 1024; + + // Nodes per permutation unit: when iterating over nodes in a chunk, we divide + // them into permutation units, iterate over permutation orders in random + // order, and iterate over nodes inside a permutation unit in random order. + static constexpr shm::NodeID kPermutationSize = 64; + + // When randomizing the node order inside a permutation unit, we pick a random + // permutation from a pool of permutations. This constant determines the pool + // size. + static constexpr std::size_t kNumberOfNodePermutations = 64; + + // If true, we count the number of empty clusters + static constexpr bool kTrackClusterCount = false; + + // If true, match singleton clusters in 2-hop distance + static constexpr bool kUseTwoHopClustering = false; + + static constexpr bool kUseActualGain = false; + + static constexpr bool kUseActiveSetStrategy = true; + static constexpr bool kUseLocalActiveSetStrategy = false; +}; + +/*! + * Generic implementation of parallel label propagation. To use, inherit from + * this class and implement all mandatory template functions. + * + * @tparam Derived Derived class for static polymorphism. + * @tparam Config Algorithmic configuration and data types. + */ +template class LabelPropagation { + static_assert(std::is_base_of_v); + + SET_DEBUG(false); + SET_STATISTICS_FROM_GLOBAL(); + +protected: + using RatingMap = typename Config::RatingMap; + using Graph = typename Config::Graph; + using NodeID = typename Graph::NodeID; + using NodeWeight = typename Graph::NodeWeight; + using EdgeID = typename Graph::EdgeID; + using EdgeWeight = typename Graph::EdgeWeight; + using ClusterID = typename Config::ClusterID; + using ClusterWeight = typename Config::ClusterWeight; + +public: + void set_max_degree(const NodeID max_degree) { + _max_degree = max_degree; + } + [[nodiscard]] NodeID max_degree() const { + return _max_degree; + } + + void set_max_num_neighbors(const ClusterID max_num_neighbors) { + _max_num_neighbors = max_num_neighbors; + } + [[nodiscard]] ClusterID max_num_neighbors() const { + return _max_num_neighbors; + } + + void set_desired_num_clusters(const ClusterID desired_num_clusters) { + _desired_num_clusters = desired_num_clusters; + } + [[nodiscard]] ClusterID desired_num_clusters() const { + return _desired_num_clusters; + } + + [[nodiscard]] EdgeWeight expected_total_gain() const { + return _expected_total_gain; + } + +protected: + /*! + * (Re)allocates memory to run label propagation on a graph with \c num_nodes + * nodes. + * @param num_nodes Number of nodes in the graph. + */ + void allocate(const NodeID num_nodes, const ClusterID num_clusters) { + allocate(num_nodes, num_nodes, num_clusters); + } + + /*! + * (Re)allocates memory to run label propagation on a graph with \c num_nodes + * nodes in total, but a clustering is only computed for the first \c + * num_active_nodes nodes. + * + * This is mostly useful for distributed graphs where ghost nodes are always + * inactive. + * + * @param num_nodes Total number of nodes in the graph, i.e., neighbors of + * active nodes have an ID less than this. + * @param num_active_nodes Number of nodes for which a cluster label is + * computed. 
+ */ + void allocate(const NodeID num_nodes, const NodeID num_active_nodes, const NodeID num_clusters) { + if (_num_nodes < num_nodes) { + if constexpr (Config::kUseLocalActiveSetStrategy) { + _active.resize(num_nodes); + } + _num_nodes = num_nodes; + } + + if (_num_active_nodes < num_active_nodes) { + if constexpr (Config::kUseActiveSetStrategy) { + _active.resize(num_active_nodes); + } + if constexpr (Config::kUseTwoHopClustering) { + _favored_clusters.resize(num_active_nodes); + } + _num_active_nodes = num_active_nodes; + } + if (_num_clusters < num_clusters) { + for (auto &rating_map : _rating_map_ets) { + rating_map.change_max_size(num_clusters); + } + _num_clusters = num_clusters; + } + } + + /*! + * Initialize label propagation. Must be called after \c allocate(). + * @param graph Graph for label propagation. + * @param num_clusters Number of different clusters the nodes are placed in + * initially. When using label propagation as refinement graphutils, this is + * usually the number of blocks. When using as for clustering, it is usually + * the number of nodes. + */ + void initialize(const Graph *graph, const ClusterID num_clusters) { + KASSERT( + graph->n() == 0 || (_num_nodes > 0u && _num_active_nodes > 0u), + "you must call allocate() before initialize()" + ); + + _graph = graph; + _initial_num_clusters = num_clusters; + _current_num_clusters = num_clusters; + reset_state(); + } + + /*! + * Determines whether we should stop label propagation because the number of + * non-empty clusters has been reduced sufficiently. + * @return Whether label propagation should be stopped now. + */ + bool should_stop() { + if (Config::kTrackClusterCount) { + return _current_num_clusters <= _desired_num_clusters; + } + return false; + } + + /*! + * Move a single node to a new cluster. + * + * @param u The node that is moved. + * @param local_rand Thread-local \c Random object. + * @param local_rating_map Thread-local rating map for gain computation. + * @return Pair with: whether the node was moved to another cluster, whether + * the previous cluster is now empty. + */ + template + std::pair + handle_node(const NodeID u, Random &local_rand, LocalRatingMap &local_rating_map) { + if (derived_skip_node(u)) { + return {false, false}; + } + + const NodeWeight u_weight = _graph->node_weight(u); + const ClusterID u_cluster = derived_cluster(u); + const auto [new_cluster, new_gain] = + find_best_cluster(u, u_weight, u_cluster, local_rand, local_rating_map); + + if (derived_cluster(u) != new_cluster) { + if (derived_move_cluster_weight( + u_cluster, new_cluster, u_weight, derived_max_cluster_weight(new_cluster) + )) { + derived_move_node(u, new_cluster); + activate_neighbors(u); + IFSTATS(_expected_total_gain += new_gain); + + const bool decrement_cluster_count = + Config::kTrackClusterCount && derived_cluster_weight(u_cluster) == 0; + // do not update _current_num_clusters here to avoid fetch_add() + return {true, decrement_cluster_count}; // did move, did reduce nonempty + // cluster count? + } + } + + // did not move, did not reduce cluster count + return {false, false}; + } + + struct ClusterSelectionState { + Random &local_rand; + NodeID u; + NodeWeight u_weight; + ClusterID initial_cluster; + ClusterWeight initial_cluster_weight; + ClusterID best_cluster; + EdgeWeight best_gain; + ClusterWeight best_cluster_weight; + ClusterID current_cluster; + EdgeWeight current_gain; + ClusterWeight current_cluster_weight; + }; + + /*! + * Computes the best feasible cluster for a node. 
+ * + * @param u The node for which the cluster is computed. + * @param u_weight The weight of the node. + * @param u_cluster The current cluster of the node. + * @param local_rand Thread-local \c Random object. + * @param local_rating_map Thread-local rating map to compute gain values. + * @return Pair with: new cluster of the node, gain value for the move to the + * new cluster. + */ + template + std::pair find_best_cluster( + const NodeID u, + const NodeWeight u_weight, + const ClusterID u_cluster, + Random &local_rand, + LocalRatingMap &local_rating_map + ) { + auto action = [&](auto &map) { + const ClusterWeight initial_cluster_weight = derived_cluster_weight(u_cluster); + ClusterSelectionState state{ + .local_rand = local_rand, + .u = u, + .u_weight = u_weight, + .initial_cluster = u_cluster, + .initial_cluster_weight = initial_cluster_weight, + .best_cluster = u_cluster, + .best_gain = 0, + .best_cluster_weight = initial_cluster_weight, + .current_cluster = 0, + .current_gain = 0, + .current_cluster_weight = 0, + }; + + bool is_interface_node = false; + + auto add_to_rating_map = [&](const EdgeID e, const NodeID v) { + if (derived_accept_neighbor(u, v)) { + const ClusterID v_cluster = derived_cluster(v); + const EdgeWeight rating = _graph->edge_weight(e); + map[v_cluster] += rating; + if constexpr (Config::kUseLocalActiveSetStrategy) { + is_interface_node |= v >= _num_active_nodes; + } + } + }; + + const EdgeID from = _graph->first_edge(u); + const EdgeID to = from + std::min(_graph->degree(u), _max_num_neighbors); + for (EdgeID e = from; e < to; ++e) { + add_to_rating_map(e, _graph->edge_target(e)); + } + + if constexpr (Config::kUseLocalActiveSetStrategy) { + if (!is_interface_node) { + _active[u] = 0; + } + } + if constexpr (Config::kUseActiveSetStrategy) { + _active[u] = 0; + } + + // After LP, we might want to use 2-hop clustering to merge nodes that + // could not find any cluster to join for this, we store a favored cluster + // for each node u if: + // (1) we actually use 2-hop clustering + // (2) u is still in a singleton cluster (weight of node == weight of cluster) + // (3) the cluster is light (at most half full) + ClusterID favored_cluster = u_cluster; + const bool store_favored_cluster = + Config::kUseTwoHopClustering && u_weight == initial_cluster_weight && + initial_cluster_weight <= derived_max_cluster_weight(u_cluster) / 2; + + const EdgeWeight gain_delta = (Config::kUseActualGain) ? map[u_cluster] : 0; + + for (const auto [cluster, rating] : map.entries()) { + state.current_cluster = cluster; + state.current_gain = rating - gain_delta; + state.current_cluster_weight = derived_cluster_weight(cluster); + + if (store_favored_cluster && state.current_gain > state.best_gain) { + favored_cluster = state.current_cluster; + } + + if (derived_accept_cluster(state)) { + state.best_cluster = state.current_cluster; + state.best_cluster_weight = state.current_cluster_weight; + state.best_gain = state.current_gain; + } + } + + // if we couldn't join any cluster, we store the favored cluster + if (store_favored_cluster && state.best_cluster == state.initial_cluster) { + _favored_clusters[u] = favored_cluster; + } + + const EdgeWeight actual_gain = IFSTATS(state.best_gain - map[state.initial_cluster]); + map.clear(); + return std::make_pair(state.best_cluster, actual_gain); + }; + + const auto [best_cluster, gain] = local_rating_map.execute( + std::min(_graph->degree(u), _initial_num_clusters), action + ); + + return {best_cluster, gain}; + } + + /*! 
+ * Flags neighbors of a node that has been moved as active. + * + * @param u Node that was moved. + */ + void activate_neighbors(const NodeID u) { + for (const NodeID v : _graph->adjacent_nodes(u)) { + // call derived_activate_neighbor() even if we do not use the active set + // strategy since the function might have side effects; the compiler + // should remove it if it does not side effects + if (derived_activate_neighbor(v)) { + if constexpr (Config::kUseActiveSetStrategy || Config::kUseLocalActiveSetStrategy) { + _active[v].store(1, std::memory_order_relaxed); + } + } + } + } + + void match_isolated_nodes( + const NodeID from = 0, const NodeID to = std::numeric_limits::max() + ) { + handle_isolated_nodes_impl(from, to); + } + + void cluster_isolated_nodes( + const NodeID from = 0, const NodeID to = std::numeric_limits::max() + ) { + handle_isolated_nodes_impl(from, to); + } + + template + void handle_isolated_nodes_impl( + const NodeID from = 0, const NodeID to = std::numeric_limits::max() + ) { + constexpr ClusterID kInvalidClusterID = std::numeric_limits::max(); + tbb::enumerable_thread_specific current_cluster_ets(kInvalidClusterID); + + tbb::parallel_for( + tbb::blocked_range(from, std::min(_graph->n(), to)), + [&](tbb::blocked_range r) { + ClusterID cluster = current_cluster_ets.local(); + + for (NodeID u = r.begin(); u != r.end(); ++u) { + if (_graph->degree(u) == 0) { + const ClusterID cu = derived_cluster(u); + + if (cluster != kInvalidClusterID && + derived_move_cluster_weight( + cu, cluster, derived_cluster_weight(cu), derived_max_cluster_weight(cluster) + )) { + derived_move_node(u, cluster); + if constexpr (match) { + cluster = kInvalidClusterID; + } + } else { + cluster = cu; + } + } + } + + current_cluster_ets.local() = cluster; + } + ); + } + + void match_two_hop_nodes_threadwise( + const NodeID from = 0, const NodeID to = std::numeric_limits::max() + ) { + handle_two_hop_nodes_threadwise_impl(from, to); + } + + void cluster_two_hop_nodes_threadwise( + const NodeID from = 0, const NodeID to = std::numeric_limits::max() + ) { + handle_two_hop_nodes_threadwise_impl(from, to); + } + + template + void handle_two_hop_nodes_threadwise_impl( + const NodeID from = 0, const NodeID to = std::numeric_limits::max() + ) { + static_assert(Config::kUseTwoHopClustering, "2-hop clustering is disabled"); + + tbb::enumerable_thread_specific> matching_map_ets; + + auto is_considered_for_two_hop_clustering = [&](const NodeID u) { + // Skip nodes not considered for two-hop clustering + if (_graph->degree(u) == 0) { + // Not considered: isolated node + return false; + } else if (u != derived_cluster(u)) { + // Not considered: joined another cluster + return false; + } else { + // If u did not join another cluster, there could still be other nodes that joined this + // node's cluster: find out by checking the cluster weight + const ClusterWeight current_weight = derived_cluster_weight(u); + if (current_weight > derived_max_cluster_weight(u) / 2 || + current_weight != derived_initial_cluster_weight(u)) { + // Not considered: not a singleton cluster; or its weight is too heavy + return false; + } + } + + return true; + }; + + auto handle_node = [&](DynamicFlatMap &matching_map, const NodeID u) { + ClusterID &rep_key = matching_map[_favored_clusters[u]]; + + if (rep_key == 0) { + rep_key = u + 1; + } else { + const ClusterID rep = rep_key - 1; + + const bool could_move_u_to_rep = derived_move_cluster_weight( + u, rep, derived_cluster_weight(u), derived_max_cluster_weight(rep) + ); + + if 
constexpr (match) { + KASSERT(could_move_u_to_rep); + derived_move_node(u, rep); + rep_key = 0; + } else { + if (could_move_u_to_rep) { + derived_move_node(u, rep); + } else { + rep_key = u + 1; + } + } + } + }; + + tbb::parallel_for( + tbb::blocked_range(from, std::min(to, _graph->n()), 512), + [&](const tbb::blocked_range &r) { + auto &matching_map = matching_map_ets.local(); + + for (NodeID u = r.begin(); u != r.end(); ++u) { + if (is_considered_for_two_hop_clustering(u)) { + handle_node(matching_map, u); + } + } + } + ); + } + + void match_two_hop_nodes( + const NodeID from = 0, const NodeID to = std::numeric_limits::max() + ) { + handle_two_hop_nodes_impl(from, to); + } + + void cluster_two_hop_nodes( + const NodeID from = 0, const NodeID to = std::numeric_limits::max() + ) { + handle_two_hop_nodes_impl(from, to); + } + + template + void handle_two_hop_nodes_impl( + const NodeID from = 0, const NodeID to = std::numeric_limits::max() + ) { + static_assert(Config::kUseTwoHopClustering, "2-hop clustering is disabled"); + + auto is_considered_for_two_hop_clustering = [&](const NodeID u) { + // Skip nodes not considered for two-hop clustering + if (_graph->degree(u) == 0) { + // Not considered: isolated node + return false; + } else if (u != derived_cluster(u)) { + // Not considered: joined another cluster + return false; + } else { + // If u did not join another cluster, there could still be other nodes that joined this + // node's cluster: find out by checking the cluster weight + const ClusterWeight current_weight = derived_cluster_weight(u); + if (current_weight > derived_max_cluster_weight(u) / 2 || + current_weight != derived_initial_cluster_weight(u)) { + // Not considered: not a singleton cluster; or its weight is too heavy + return false; + } + } + + return true; + }; + + // There could be edge cases where the favorite cluster of a node is itself a singleton cluster + // (for instance, if a node joins another cluster during the first round, but moves out of the + // cluster in the next round) + // Since the following code is based on the ansumption that the favorite cluster of a node that + // is considered for two-hop clustering it itself not considere for two-hop clustering, we fix + // this situation by moving the nodes to their favorite cluster, if possible, here. 
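Both two-hop variants rest on the same idea: a node that stayed in a singleton cluster remembers its highest-rated (favored) cluster, and singletons that share a favored cluster are merged with each other; the threadwise variant above does this grouping in a thread-local map keyed by the favored cluster. A simplified sequential sketch of that grouping (illustrative standalone code; it ignores cluster-weight constraints and the matching mode, which resets the representative after a single merge):

#include <cstdint>
#include <unordered_map>
#include <vector>

using NodeID = std::uint32_t;

// clusters[u] == u  <=> u still leads a singleton cluster.
// favored[u]        == highest-rated cluster recorded for u during label propagation.
// Merges singleton leaders that share the same favored cluster: the first one
// seen becomes the representative, later ones join it.
void cluster_two_hop_sequential(std::vector<NodeID> &clusters,
                                const std::vector<NodeID> &favored) {
  std::unordered_map<NodeID, NodeID> representative; // favored cluster -> representative node
  for (NodeID u = 0; u < clusters.size(); ++u) {
    if (clusters[u] != u) {
      continue; // already joined another cluster: not eligible for two-hop clustering
    }
    const auto [it, inserted] = representative.emplace(favored[u], u);
    if (!inserted) {
      clusters[u] = it->second; // a singleton already claimed this favored cluster: join it
    }
  }
}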
+ tbb::parallel_for(from, std::min(to, _graph->n()), [&](const NodeID u) { + if (is_considered_for_two_hop_clustering(u)) { + const NodeID cluster = _favored_clusters[u]; + if (is_considered_for_two_hop_clustering(cluster) && + derived_move_cluster_weight( + u, cluster, derived_cluster_weight(u), derived_max_cluster_weight(cluster) + )) { + derived_move_node(u, cluster); + --_current_num_clusters; + } + } else { + _favored_clusters[u] = u; + } + }); + + KASSERT( + [&] { + for (NodeID u = from; u < std::min(to, _graph->n()); ++u) { + if (_favored_clusters[u] >= _graph->n()) { + LOG_WARNING << "favored cluster of node " << u + << " out of bounds: " << _favored_clusters[u] << " > " << _graph->n(); + } + if (u != _favored_clusters[u] && is_considered_for_two_hop_clustering(u) && + is_considered_for_two_hop_clustering(_favored_clusters[u])) { + LOG_WARNING << "node " << u << " (degree " << _graph->degree(u) << " )" + << " is considered for two-hop clustering, but its favored cluster " + << _favored_clusters[u] << " (degree " + << _graph->degree(_favored_clusters[u]) + << ") is also considered for two-hop clustering"; + return false; + } + } + return true; + }(), + "precondition for two-hop clustering violated: found favored clusters that could be joined", + assert::heavy + ); + + // During label propagation, we store the best cluster for each node in _favored_cluster[] + // regardless of whether there is enough space in the cluster for the node to join. + // We now use this information to merge nodes that could not join any cluster, i.e., + // singleton-clusters by clustering or matching nodes that have favored cluster. + + tbb::parallel_for(from, std::min(to, _graph->n()), [&](const NodeID u) { + if (should_stop()) { + return; + } + + // Skip nodes not considered for two-hop clustering + if (!is_considered_for_two_hop_clustering(u)) { + return; + } + + // Invariant: + // For each node u that is considered for two-hop clustering (i.e., nodes for which the + // following lines of code are executed), _favored_clusters[u] refers to node which *IS NOT* + // considered for two-hop matching. + // + // Reasoning: + // KASSERT() + // + // Conclusion: + // We can use _favored_clusters[u] to build the two-hop clusters. 
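The loop that follows uses the entry _favored_clusters[C] itself as a synchronization slot: it still holds C while no representative has been claimed, and afterwards holds the node that represents the new two-hop cluster. A compact sketch of the claim-or-join step with std::atomic (standalone and simplified; the actual code additionally retries after failed weight moves and handles the matching-mode reset):

#include <atomic>
#include <cstdint>

using NodeID = std::uint32_t;

// `slot` stands for _favored_clusters[C] and initially holds C. The first node
// to arrive claims the slot and becomes the representative; every later node
// observes the representative and joins it.
// Returns the node whose cluster `u` should join (possibly `u` itself).
NodeID claim_or_join(std::atomic<NodeID> &slot, const NodeID C, const NodeID u) {
  NodeID observed = C;
  if (slot.compare_exchange_strong(observed, u)) {
    return u; // we became the representative; later nodes will join us
  }
  return observed; // slot already holds a representative: join it
}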
+ + const NodeID C = _favored_clusters[u]; + auto &sync = _favored_clusters[C]; + + do { + NodeID cluster = sync; + + if (cluster == C) { + if (sync.compare_exchange_strong(cluster, u)) { + // We are done: other nodes will join our cluster + break; + } + if (cluster == C) { + continue; + } + } + + // Invariant: cluster is a node with favored cluster C + KASSERT( + _favored_clusters[cluster] == C, + "invariant violated by: " << V(u) << V(cluster) << V(C) << V(_favored_clusters[C]) + ); + + // Try to join the cluster: + if constexpr (match) { + // Matching mode: try to build a cluster only containing nodes "cluster" and "u" + if (sync.compare_exchange_strong(cluster, C)) { + [[maybe_unused]] const bool success = derived_move_cluster_weight( + u, cluster, derived_cluster_weight(u), derived_max_cluster_weight(cluster) + ); + KASSERT( + success, + "node " << u << " could be matched with node " << cluster << ": " + << derived_cluster_weight(u) << " + " << derived_cluster_weight(cluster) + << " > " << derived_max_cluster_weight(cluster) + ); + + derived_move_node(u, cluster); + + // We are done: build a cluster with "cluster", reset "sync" to C + break; + } + } else { + // Clustering mode: try to join cluster "cluster" if the weight constraint permits it, + // otherwise try to start a new cluster + if (derived_move_cluster_weight( + u, cluster, derived_cluster_weight(u), derived_max_cluster_weight(cluster) + )) { + derived_move_node(u, cluster); + + // We are done: joined cluster "cluster" + break; + } else if (sync.compare_exchange_strong(cluster, u)) { + // We are done: other nodes will join our cluster + break; + } + } + } while (true); + }); + } + +private: + void reset_state() { + tbb::parallel_invoke( + [&] { + tbb::parallel_for(0, _graph->n(), [&](const NodeID u) { + if constexpr (Config::kUseActiveSetStrategy || Config::kUseLocalActiveSetStrategy) { + _active[u] = 1; + } + + const ClusterID initial_cluster = derived_initial_cluster(u); + derived_init_cluster(u, initial_cluster); + if constexpr (Config::kUseTwoHopClustering) { + _favored_clusters[u] = initial_cluster; + } + + derived_reset_node_state(u); + }); + }, + [&] { + tbb::parallel_for(0, _initial_num_clusters, [&](const ClusterID cluster) { + derived_init_cluster_weight(cluster, derived_initial_cluster_weight(cluster)); + }); + } + ); + IFSTATS(_expected_total_gain = 0); + _current_num_clusters = _initial_num_clusters; + } + +private: // CRTP calls + //! Return current cluster ID of node \c u. + [[nodiscard]] ClusterID derived_cluster(const NodeID u) { + return static_cast(this)->cluster(u); + } + + //! Initially place \c u in cluster \cluster. + void derived_init_cluster(const NodeID u, const ClusterID cluster) { + static_cast(this)->init_cluster(u, cluster); + } + + //! Change cluster of node \c u to \c cluster. + void derived_move_node(const NodeID u, const ClusterID cluster) { + static_cast(this)->move_node(u, cluster); + } + + //! Return current weight of cluster \c cluster. + [[nodiscard]] ClusterWeight derived_cluster_weight(const ClusterID cluster) { + return static_cast(this)->cluster_weight(cluster); + } + + //! Initially set weight of cluster \cluster to \c weight. + void derived_init_cluster_weight(const ClusterID cluster, const ClusterWeight weight) { + static_cast(this)->init_cluster_weight(cluster, weight); + } + + //! Attempt to move \c delta weight from cluster \c old_cluster to \c + //! new_cluster, which can take at most \c max_weight weight. 
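The derived_* wrappers in this section forward to the concrete subclass via CRTP: the base template casts this to Derived and calls the hook the subclass provides, so the hot loop needs no virtual dispatch. A minimal self-contained sketch of the pattern (illustrative names only, not the patch's classes):

#include <cstdint>
#include <iostream>

// The base template implements the generic algorithm and reaches the
// subclass's hooks through static_cast<Derived *> -- resolved at compile
// time, no virtual dispatch.
template <typename Derived> class ClusterBase {
public:
  void run(const std::uint32_t node) {
    const std::uint32_t c = static_cast<Derived *>(this)->cluster(node);
    static_cast<Derived *>(this)->move_node(node, c + 1);
  }
};

class MyClustering : public ClusterBase<MyClustering> {
public:
  std::uint32_t cluster(const std::uint32_t node) const { return node % 4; }
  void move_node(const std::uint32_t node, const std::uint32_t c) {
    std::cout << "node " << node << " -> cluster " << c << '\n';
  }
};

int main() {
  MyClustering lp;
  lp.run(10); // prints "node 10 -> cluster 3"
}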
+ [[nodiscard]] bool derived_move_cluster_weight( + const ClusterID old_cluster, + const ClusterID new_cluster, + const ClusterWeight delta, + const ClusterWeight max_weight + ) { + return static_cast(this)->move_cluster_weight( + old_cluster, new_cluster, delta, max_weight + ); + } + + //! Return the maximum weight of cluster \c cluster. + [[nodiscard]] ClusterWeight derived_max_cluster_weight(const ClusterID cluster) { + return static_cast(this)->max_cluster_weight(cluster); + } + + //! Determine whether a node should be moved to a new cluster. + [[nodiscard]] bool derived_accept_cluster(const ClusterSelectionState &state) { + return static_cast(this)->accept_cluster(state); + } + + void derived_reset_node_state(const NodeID u) { + static_cast(this)->reset_node_state(u); + } + + [[nodiscard]] inline bool derived_accept_neighbor(const NodeID u, const NodeID v) { + return static_cast(this)->accept_neighbor(u, v); + } + + [[nodiscard]] inline bool derived_activate_neighbor(const NodeID u) { + return static_cast(this)->activate_neighbor(u); + } + + [[nodiscard]] ClusterID derived_initial_cluster(const NodeID u) { + return static_cast(this)->initial_cluster(u); + } + + [[nodiscard]] ClusterWeight derived_initial_cluster_weight(const ClusterID cluster) { + return static_cast(this)->initial_cluster_weight(cluster); + } + + [[nodiscard]] bool derived_skip_node(const NodeID node) { + return static_cast(this)->skip_node(node); + } + +protected: // Default implementations + void reset_node_state(const NodeID /* node */) {} + + [[nodiscard]] inline bool accept_neighbor(const NodeID /* u */, const NodeID /* v */) { + return true; + } + + [[nodiscard]] inline bool activate_neighbor(const NodeID /* node */) { + return true; + } + + [[nodiscard]] inline ClusterID initial_cluster(const NodeID u) { + return derived_cluster(u); + } + + [[nodiscard]] inline ClusterWeight initial_cluster_weight(const ClusterID cluster) { + return derived_cluster_weight(cluster); + } + + [[nodiscard]] inline bool skip_node(const NodeID /* node */) { + return false; + } + +protected: // Members + //! Graph we operate on, or \c nullptr if \c initialize has not been called + //! yet. + const Graph *_graph{nullptr}; + + //! The number of non-empty clusters before we ran the first iteration of + //! label propagation. + ClusterID _initial_num_clusters; + + //! The current number of non-empty clusters. Only meaningful if empty + //! clusters are being counted. + parallel::Atomic _current_num_clusters; + + //! We stop label propagation if the number of non-empty clusters falls below + //! this threshold. Only has an effect if empty clusters are being counted. + ClusterID _desired_num_clusters = 0; + + //! We do not move nodes with a degree higher than this. However, other nodes + //! may still be moved to the cluster of with degree larger than this + //! threshold. + NodeID _max_degree = std::numeric_limits::max(); + + //! When computing the gain values for a node, this is an upper limit on the + //! number of neighbors of the nodes we consider. Any more neighbors are + //! ignored. + NodeID _max_num_neighbors = std::numeric_limits::max(); + + //! Thread-local map to compute gain values. + tbb::enumerable_thread_specific _rating_map_ets{[this] { + return RatingMap(_num_clusters); + }}; + + //! Flags nodes with at least one node in its neighborhood that changed + //! clusters during the last iteration. Nodes without this flag set must not + //! be considered in the next iteration. + scalable_vector> _active; + + //! 
If a node cannot join any cluster during an iteration, this vector stores + //! the node's highest rated cluster independent of the maximum cluster + //! weight. This information is used during 2-hop clustering. + scalable_vector> _favored_clusters; + + //! If statistics are enabled, this is the sum of the gain of all moves that + //! were performed. If executed single-thread, this should be equal to the + //! reduction of the edge cut. + parallel::Atomic _expected_total_gain; + +private: + NodeID _num_nodes = 0; + NodeID _num_active_nodes = 0; + ClusterID _num_clusters = 0; +}; + +/*! + * Parallel label propagation template that iterates over nodes in their natural + * order. + * @tparam Derived Derived subclass for static polymorphism. + * @tparam Config Algorithmic configuration and data types. + */ +template +class InOrderLabelPropagation : public LabelPropagation { + static_assert(std::is_base_of_v); + SET_DEBUG(true); + +protected: + using Base = LabelPropagation; + + using Graph = typename Base::Graph; + using ClusterID = typename Base::ClusterID; + using ClusterWeight = typename Base::ClusterWeight; + using EdgeID = typename Base::EdgeID; + using EdgeWeight = typename Base::EdgeWeight; + using NodeID = typename Base::NodeID; + using NodeWeight = typename Base::NodeWeight; + + using Base::handle_node; + using Base::set_max_degree; + using Base::set_max_num_neighbors; + using Base::should_stop; + + NodeID + perform_iteration(const NodeID from = 0, const NodeID to = std::numeric_limits::max()) { + tbb::enumerable_thread_specific num_moved_nodes_ets; + + tbb::parallel_for( + tbb::blocked_range(from, std::min(_graph->n(), to)), + [&](const auto &r) { + EdgeID work_since_update = 0; + NodeID num_removed_clusters = 0; + + auto &num_moved_nodes = num_moved_nodes_ets.local(); + auto &rand = Random::instance(); + auto &rating_map = _rating_map_ets.local(); + + for (NodeID u = r.begin(); u != r.end(); ++u) { + if (_graph->degree(u) > _max_degree) { + continue; + } + + if constexpr (Config::kUseActiveSetStrategy || Config::kUseLocalActiveSetStrategy) { + if (!_active[u].load(std::memory_order_relaxed)) { + continue; + } + } + + if (work_since_update > Config::kMinChunkSize) { + if (Base::should_stop()) { + return; + } + + _current_num_clusters -= num_removed_clusters; + work_since_update = 0; + num_removed_clusters = 0; + } + + const auto [moved_node, emptied_cluster] = handle_node(u, rand, rating_map); + work_since_update += _graph->degree(u); + if (moved_node) { + ++num_moved_nodes; + } + if (emptied_cluster) { + ++num_removed_clusters; + } + } + } + ); + + return num_moved_nodes_ets.combine(std::plus{}); + } + + using Base::_active; + using Base::_current_num_clusters; + using Base::_graph; + using Base::_max_degree; + using Base::_rating_map_ets; +}; + +/*! + * Parallel label propagation template that iterates over nodes in chunk random + * order. + * @tparam Derived Derived subclass for static polymorphism. + * @tparam Config Algorithmic configuration and data types. 
+ */ +template <typename Derived, typename Config> +class ChunkRandomdLabelPropagation : public LabelPropagation<Derived, Config> { + using Base = LabelPropagation<Derived, Config>; + static_assert(std::is_base_of_v<LabelPropagationConfig, Config>); + + SET_DEBUG(false); + +protected: + using Graph = typename Base::Graph; + using ClusterID = typename Base::ClusterID; + using ClusterWeight = typename Base::ClusterWeight; + using EdgeID = typename Base::EdgeID; + using EdgeWeight = typename Base::EdgeWeight; + using NodeID = typename Base::NodeID; + using NodeWeight = typename Base::NodeWeight; + + using Base::handle_node; + using Base::set_max_degree; + using Base::set_max_num_neighbors; + using Base::should_stop; + + void initialize(const Graph *graph, const ClusterID num_clusters) { + Base::initialize(graph, num_clusters); + _chunks.clear(); + _buckets.clear(); + } + + /** + * Performs label propagation on local nodes in range [from, to) in + * chunk-randomized order. + * + * The randomization works in multiple steps: + * - Nodes within the iteration order are split into chunks of consecutive + * nodes. The size of each chunk is determined by + * LabelPropagationConfig::kMinChunkSize, which is a lower bound on the sum of + * the degrees assigned to a chunk (nodes are assigned to a chunk until the + * limit is exceeded). + * - Afterwards, the order of the chunks is shuffled. + * - Finally, chunks are processed in parallel. To this end, the nodes + * assigned to a chunk are once more split into sub-chunks, which are then + * processed sequentially and in-order; however, within a sub-chunk, nodes are + * once more shuffled. + * - If available, degree buckets are respected: chunks of smaller buckets are + * processed before chunks of larger buckets. + * + * @param from First node in the iteration range. + * @param to First node that is not part of the iteration range. + * @return Number of nodes that were moved to new blocks / clusters.
+ */ + NodeID + perform_iteration(const NodeID from = 0, const NodeID to = std::numeric_limits::max()) { + if (from != 0 || to != std::numeric_limits::max()) { + _chunks.clear(); + } + if (_chunks.empty()) { + init_chunks(from, to); + } + shuffle_chunks(); + + tbb::enumerable_thread_specific num_moved_nodes_ets; + parallel::Atomic next_chunk = 0; + + tbb::parallel_for(static_cast(0), _chunks.size(), [&](const std::size_t) { + if (should_stop()) { + return; + } + + auto &local_num_moved_nodes = num_moved_nodes_ets.local(); + auto &local_rand = Random::instance(); + auto &local_rating_map = _rating_map_ets.local(); + NodeID num_removed_clusters = 0; + + const auto chunk_id = next_chunk.fetch_add(1, std::memory_order_relaxed); + const auto &chunk = _chunks[chunk_id]; + const auto &permutation = _random_permutations.get(local_rand); + + const std::size_t num_sub_chunks = + std::ceil(1.0 * (chunk.end - chunk.start) / Config::kPermutationSize); + std::vector sub_chunk_permutation(num_sub_chunks); + std::iota(sub_chunk_permutation.begin(), sub_chunk_permutation.end(), 0); + local_rand.shuffle(sub_chunk_permutation); + + for (std::size_t sub_chunk = 0; sub_chunk < num_sub_chunks; ++sub_chunk) { + for (std::size_t i = 0; i < Config::kPermutationSize; ++i) { + const NodeID u = chunk.start + + Config::kPermutationSize * sub_chunk_permutation[sub_chunk] + + permutation[i % Config::kPermutationSize]; + if (u < chunk.end && _graph->degree(u) < _max_degree && + ((!Config::kUseActiveSetStrategy && !Config::kUseLocalActiveSetStrategy) || + _active[u].load(std::memory_order_relaxed))) { + const auto [moved_node, emptied_cluster] = handle_node(u, local_rand, local_rating_map); + if (moved_node) { + ++local_num_moved_nodes; + } + if (emptied_cluster) { + ++num_removed_clusters; + } + } + } + } + + _current_num_clusters -= num_removed_clusters; + }); + + return num_moved_nodes_ets.combine(std::plus{}); + } + +private: + struct Chunk { + NodeID start; + NodeID end; + }; + + struct Bucket { + std::size_t start; + std::size_t end; + }; + + void shuffle_chunks() { + tbb::parallel_for(0, _buckets.size(), [&](const std::size_t i) { + const auto &bucket = _buckets[i]; + Random::instance().shuffle(_chunks.begin() + bucket.start, _chunks.begin() + bucket.end); + }); + } + + void init_chunks(const NodeID from, NodeID to) { + _chunks.clear(); + _buckets.clear(); + + to = std::min(to, _graph->n()); + + const auto max_bucket = + std::min(math::floor_log2(_max_degree), _graph->number_of_buckets()); + const EdgeID max_chunk_size = std::max(Config::kMinChunkSize, std::sqrt(_graph->m())); + const NodeID max_node_chunk_size = + std::max(Config::kMinChunkSize, std::sqrt(_graph->n())); + + NodeID position = 0; + for (std::size_t bucket = 0; bucket < max_bucket; ++bucket) { + if (position + _graph->bucket_size(bucket) < from || _graph->bucket_size(bucket) == 0) { + position += _graph->bucket_size(bucket); + continue; + } + if (position >= to) { + break; + } + + NodeID remaining_bucket_size = _graph->bucket_size(bucket); + if (from > _graph->first_node_in_bucket(bucket)) { + remaining_bucket_size -= from - _graph->first_node_in_bucket(bucket); + } + const std::size_t bucket_size = + std::min({remaining_bucket_size, to - position, to - from}); + + parallel::Atomic offset = 0; + tbb::enumerable_thread_specific num_chunks_ets; + tbb::enumerable_thread_specific> chunks_ets; + + const std::size_t bucket_start = std::max(_graph->first_node_in_bucket(bucket), from); + + tbb::parallel_for( + static_cast(0), + 
tbb::this_task_arena::max_concurrency(), + [&](const int) { + auto &chunks = chunks_ets.local(); + auto &num_chunks = num_chunks_ets.local(); + + while (offset < bucket_size) { + const NodeID begin = offset.fetch_add(max_node_chunk_size); + if (begin >= bucket_size) { + break; + } + const NodeID end = std::min(begin + max_node_chunk_size, bucket_size); + + EdgeID current_chunk_size = 0; + NodeID chunk_start = bucket_start + begin; + + for (NodeID i = begin; i < end; ++i) { + const NodeID u = bucket_start + i; + current_chunk_size += _graph->degree(u); + if (current_chunk_size >= max_chunk_size) { + chunks.push_back({chunk_start, u + 1}); + chunk_start = u + 1; + current_chunk_size = 0; + ++num_chunks; + } + } + + if (current_chunk_size > 0) { + chunks.push_back( + {static_cast(chunk_start), static_cast(bucket_start + end)} + ); + ++num_chunks; + } + } + } + ); + + const std::size_t num_chunks = num_chunks_ets.combine(std::plus{}); + + const std::size_t chunks_start = _chunks.size(); + parallel::Atomic pos = chunks_start; + _chunks.resize(chunks_start + num_chunks); + tbb::parallel_for(chunks_ets.range(), [&](auto &r) { + for (auto &chunk : r) { + const std::size_t local_pos = pos.fetch_add(chunk.size()); + std::copy(chunk.begin(), chunk.end(), _chunks.begin() + local_pos); + } + }); + + _buckets.push_back({chunks_start, _chunks.size()}); + + position += _graph->bucket_size(bucket); + } + + // Make sure that we cover all nodes in [from, to) + KASSERT( + [&] { + std::vector hit(to - from); + for (const auto &[start, end] : _chunks) { + KASSERT(start <= end, ""); + EdgeWeight total_work = 0; + + for (NodeID u = start; u < end; ++u) { + KASSERT(from <= u, ""); + KASSERT(u < to, ""); + KASSERT(!hit[u - from], ""); + + hit[u - from] = true; + total_work += _graph->degree(u); + } + } + + for (NodeID u = 0; u < to - from; ++u) { + KASSERT( + _graph->degree(u) == 0u || hit[u], + V(_graph->degree(u)) << V(from) << V(u + from) << V(to) + ); + } + + return true; + }(), + "", + assert::heavy + ); + } + +protected: + using Base::_active; + using Base::_current_num_clusters; + using Base::_graph; + using Base::_max_degree; + using Base::_rating_map_ets; + + RandomPermutations + _random_permutations{}; + std::vector _chunks; + std::vector _buckets; +}; + +template class NonatomicOwnedClusterVector { +public: + explicit NonatomicOwnedClusterVector(const NodeID max_num_nodes) : _clusters(max_num_nodes) { + tbb::parallel_for(0, max_num_nodes, [&](const NodeID u) { _clusters[u] = 0; }); + } + + [[nodiscard]] auto &&take_clusters() { + return std::move(_clusters); + } + + [[nodiscard]] auto &clusters() { + return _clusters; + } + + void init_cluster(const NodeID node, const ClusterID cluster) { + move_node(node, cluster); + } + + [[nodiscard]] ClusterID cluster(const NodeID node) { + KASSERT(node < _clusters.size()); + return __atomic_load_n(&_clusters[node], __ATOMIC_RELAXED); + } + + void move_node(const NodeID node, const ClusterID cluster) { + KASSERT(node < _clusters.size()); + __atomic_store_n(&_clusters[node], cluster, __ATOMIC_RELAXED); + } + + void ensure_cluster_size(const NodeID max_num_nodes) { + if (_clusters.size() < max_num_nodes) { + _clusters.resize(max_num_nodes); + } + } + +private: + NoinitVector _clusters; +}; + +template class OwnedClusterVector { +public: + explicit OwnedClusterVector(const NodeID max_num_nodes) : _clusters(max_num_nodes) {} + + [[nodiscard]] auto &&take_clusters() { + return std::move(_clusters); + } + + [[nodiscard]] auto &clusters() { + return _clusters; + } + + void 
init_cluster(const NodeID node, const ClusterID cluster) { + _clusters[node] = cluster; + } + + [[nodiscard]] ClusterID cluster(const NodeID node) { + KASSERT(node < _clusters.size()); + return _clusters[node]; + } + + void move_node(const NodeID node, const ClusterID cluster) { + KASSERT(node < _clusters.size()); + _clusters[node] = cluster; + } + + void ensure_cluster_size(const NodeID max_num_nodes) { + if (_clusters.size() < max_num_nodes) { + _clusters.resize(max_num_nodes); + } + } + +private: + scalable_vector> _clusters; +}; + +template class OwnedRelaxedClusterWeightVector { +public: + explicit OwnedRelaxedClusterWeightVector(const ClusterID max_num_clusters) + : _cluster_weights(max_num_clusters) {} + + auto &&take_cluster_weights() { + return std::move(_cluster_weights); + } + + void init_cluster_weight(const ClusterID cluster, const ClusterWeight weight) { + _cluster_weights[cluster] = weight; + } + + ClusterWeight cluster_weight(const ClusterID cluster) { + return _cluster_weights[cluster]; + } + + bool move_cluster_weight( + const ClusterID old_cluster, + const ClusterID new_cluster, + const ClusterWeight delta, + const ClusterWeight max_weight + ) { + if (_cluster_weights[new_cluster] + delta <= max_weight) { + _cluster_weights[new_cluster].fetch_add(delta, std::memory_order_relaxed); + _cluster_weights[old_cluster].fetch_sub(delta, std::memory_order_relaxed); + return true; + } + return false; + } + +private: + scalable_vector> _cluster_weights; +}; +} // namespace kaminpar::dist diff --git a/kaminpar-dist/dkaminpar.cc b/kaminpar-dist/dkaminpar.cc index be5a6ebc..5e180055 100644 --- a/kaminpar-dist/dkaminpar.cc +++ b/kaminpar-dist/dkaminpar.cc @@ -99,12 +99,9 @@ void print_input_summary( if (root && parseable) { LOG << "EXECUTION_MODE num_mpis=" << ctx.parallel.num_mpis << " num_threads=" << ctx.parallel.num_threads; - LOG << "INPUT_GRAPH " - << "global_n=" << graph.global_n() << " " - << "global_m=" << graph.global_m() << " " - << "n=[" << n_str << "] " - << "m=[" << m_str << "] " - << "ghost_n=[" << ghost_n_str << "]"; + LOG << "INPUT_GRAPH " << "global_n=" << graph.global_n() << " " + << "global_m=" << graph.global_m() << " " << "n=[" << n_str << "] " << "m=[" << m_str + << "] " << "ghost_n=[" << ghost_n_str << "]"; } // Output @@ -269,7 +266,9 @@ GlobalEdgeWeight dKaMinPar::compute_partition(const BlockID k, BlockID *partitio // level? // The binary interface already implements graph validation via KaGen, which can be enabled as a // CLI flag. There is no such option when using the library interface. 
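Note that move_cluster_weight() in OwnedRelaxedClusterWeightVector above checks the target weight and then applies the fetch_add as two separate relaxed steps, so two threads that pass the check concurrently can push a cluster slightly past max_weight. If the bound had to be strict, the weight could instead be reserved with a compare-exchange loop; a standalone sketch with std::atomic (not the patch's code):

#include <atomic>
#include <cstdint>

using ClusterWeight = std::int64_t;

// Reserves `delta` in the target cluster with a CAS loop so that max_weight
// can never be exceeded, then releases the weight from the source cluster.
bool move_cluster_weight_strict(std::atomic<ClusterWeight> &from, std::atomic<ClusterWeight> &to,
                                const ClusterWeight delta, const ClusterWeight max_weight) {
  ClusterWeight expected = to.load(std::memory_order_relaxed);
  do {
    if (expected + delta > max_weight) {
      return false; // target cluster would become too heavy
    }
  } while (!to.compare_exchange_weak(expected, expected + delta, std::memory_order_relaxed));
  from.fetch_sub(delta, std::memory_order_relaxed);
  return true;
}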
- KASSERT(debug::validate_graph(graph), "input graph failed graph verification", assert::heavy); + KASSERT( + dist::debug::validate_graph(graph), "input graph failed graph verification", assert::heavy + ); // Setup the remaining context options that are passed in via the constructor _ctx.parallel.num_mpis = size; @@ -293,7 +292,7 @@ GlobalEdgeWeight dKaMinPar::compute_partition(const BlockID k, BlockID *partitio STOP_TIMER(); KASSERT( - debug::validate_partition(p_graph), + dist::debug::validate_partition(p_graph), "graph partition verification failed after partitioning", assert::heavy ); diff --git a/kaminpar-dist/graphutils/bfs_extractor.cc b/kaminpar-dist/graphutils/bfs_extractor.cc index c94bb953..bbcd5013 100644 --- a/kaminpar-dist/graphutils/bfs_extractor.cc +++ b/kaminpar-dist/graphutils/bfs_extractor.cc @@ -27,7 +27,6 @@ #include "kaminpar-common/assert.h" #include "kaminpar-common/datastructures/marker.h" #include "kaminpar-common/datastructures/static_array.h" -#include "kaminpar-common/random.h" #include "kaminpar-common/timer.h" namespace kaminpar::dist::graph { @@ -222,7 +221,8 @@ auto BfsExtractor::exchange_explored_subgraphs( std::move(node_weights_recvbufs[pe]), std::move(edge_weights_recvbufs[pe]), std::move(node_mapping_recvbufs[pe]), - std::move(partition_recvbufs[pe])}; + std::move(partition_recvbufs[pe]) + }; }); return fragments; @@ -552,9 +552,9 @@ auto BfsExtractor::combine_fragments(tbb::concurrent_vector &frag }); // Construct shared-memory graph - auto graph = std::make_unique( + auto graph = std::make_unique(std::make_unique( std::move(nodes), std::move(edges), std::move(node_weights), std::move(edge_weights) - ); + )); auto p_graph = std::make_unique(*graph, _p_graph->k(), std::move(partition)); diff --git a/kaminpar-dist/graphutils/rearrangement.h b/kaminpar-dist/graphutils/rearrangement.h index 55047bae..8b3f7420 100644 --- a/kaminpar-dist/graphutils/rearrangement.h +++ b/kaminpar-dist/graphutils/rearrangement.h @@ -7,10 +7,10 @@ ******************************************************************************/ #pragma once -#include "kaminpar-dist/context.h" #include "kaminpar-dist/datastructures/distributed_graph.h" +#include "kaminpar-dist/dkaminpar.h" -#include "kaminpar-common/datastructures/scalable_vector.h" +#include "kaminpar-common/datastructures/static_array.h" namespace kaminpar::dist::graph { DistributedGraph rearrange(DistributedGraph graph, const Context &ctx); diff --git a/kaminpar-dist/graphutils/replicator.cc b/kaminpar-dist/graphutils/replicator.cc index 3b1249f7..988377f9 100644 --- a/kaminpar-dist/graphutils/replicator.cc +++ b/kaminpar-dist/graphutils/replicator.cc @@ -24,7 +24,6 @@ #include "kaminpar-shm/metrics.h" #include "kaminpar-common/datastructures/static_array.h" -#include "kaminpar-common/parallel/atomic.h" namespace kaminpar::dist { SET_DEBUG(false); @@ -188,7 +187,9 @@ shm::Graph replicate_graph_everywhere(const DistributedGraph &graph) { } }); - return {std::move(nodes), std::move(edges), std::move(node_weights), std::move(edge_weights)}; + return {std::make_unique( + std::move(nodes), std::move(edges), std::move(node_weights), std::move(edge_weights) + )}; } DistributedGraph replicate_graph(const DistributedGraph &graph, const int num_replications) { diff --git a/kaminpar-dist/graphutils/subgraph_extractor.cc b/kaminpar-dist/graphutils/subgraph_extractor.cc index 4480026c..fba94b74 100644 --- a/kaminpar-dist/graphutils/subgraph_extractor.cc +++ b/kaminpar-dist/graphutils/subgraph_extractor.cc @@ -27,6 +27,7 @@ #include 
"kaminpar-common/datastructures/static_array.h" #include "kaminpar-common/math.h" #include "kaminpar-common/parallel/algorithm.h" +#include "kaminpar-common/parallel/atomic.h" #include "kaminpar-common/parallel/vector_ets.h" namespace kaminpar::dist::graph { @@ -541,13 +542,13 @@ std::pair, std::vector>> construct_s } subgraphs_offsets[b].push_back(pos_n); - subgraphs[b] = shm::Graph( + subgraphs[b] = shm::Graph(std::make_unique( std::move(subgraph_nodes), std::move(subgraph_edges), std::move(subgraph_node_weights), std::move(subgraph_edge_weights), false - ); + )); }); return {std::move(subgraphs), std::move(subgraphs_offsets)}; @@ -607,7 +608,8 @@ extract_and_scatter_block_induced_subgraphs(const DistributedPartitionedGraph &p return { std::move(gathered_subgraphs), std::move(offsets), - std::move(extracted_local_subgraphs.mapping)}; + std::move(extracted_local_subgraphs.mapping) + }; } DistributedPartitionedGraph copy_subgraph_partitions( @@ -687,9 +689,7 @@ DistributedPartitionedGraph copy_subgraph_partitions( synchronize_ghost_node_block_ids(new_p_graph); KASSERT( - debug::validate_partition(new_p_graph), - "graph partition in inconsistent state", - assert::heavy + debug::validate_partition(new_p_graph), "graph partition in inconsistent state", assert::heavy ); return new_p_graph; } @@ -788,9 +788,7 @@ DistributedPartitionedGraph copy_duplicated_subgraph_partitions( synchronize_ghost_node_block_ids(new_p_graph); KASSERT( - debug::validate_partition(new_p_graph), - "graph partition in inconsistent state", - assert::heavy + debug::validate_partition(new_p_graph), "graph partition in inconsistent state", assert::heavy ); return new_p_graph; } diff --git a/kaminpar-dist/refinement/lp/clp_refiner.cc b/kaminpar-dist/refinement/lp/clp_refiner.cc index c8d323fd..0ef29a2f 100644 --- a/kaminpar-dist/refinement/lp/clp_refiner.cc +++ b/kaminpar-dist/refinement/lp/clp_refiner.cc @@ -21,11 +21,11 @@ #include "kaminpar-dist/datastructures/distributed_graph.h" #include "kaminpar-dist/datastructures/distributed_partitioned_graph.h" #include "kaminpar-dist/graphutils/communication.h" -#include "kaminpar-dist/metrics.h" #include "kaminpar-common/assert.h" #include "kaminpar-common/datastructures/rating_map.h" #include "kaminpar-common/parallel/algorithm.h" +#include "kaminpar-common/parallel/atomic.h" #include "kaminpar-common/parallel/vector_ets.h" #include "kaminpar-common/random.h" #include "kaminpar-common/timer.h" @@ -382,8 +382,8 @@ NodeID ColoredLPRefiner::perform_best_moves(const ColorID c) { return num_local_moved_nodes; } -auto ColoredLPRefiner::reduce_move_candidates(std::vector &&candidates) - -> std::vector { +auto ColoredLPRefiner::reduce_move_candidates(std::vector &&candidates +) -> std::vector { const int size = mpi::get_comm_size(_p_graph.communicator()); const int rank = mpi::get_comm_rank(_p_graph.communicator()); KASSERT(math::is_power_of_2(size), "#PE must be a power of two", assert::always); @@ -891,7 +891,7 @@ void ColoredLPRefiner::GainStatistics::record_gain(const EdgeWeight gain, const } void ColoredLPRefiner::GainStatistics::summarize_by_size( - const NoinitVector &color_sizes, MPI_Comm comm + const NoinitVector &color_sizes, MPI_Comm comm ) const { KASSERT(!_gain_per_color.empty(), "must call initialize() first"); KASSERT(_gain_per_color.size() <= color_sizes.size()); diff --git a/kaminpar-dist/refinement/lp/clp_refiner.h b/kaminpar-dist/refinement/lp/clp_refiner.h index e23762cf..69d29dd8 100644 --- a/kaminpar-dist/refinement/lp/clp_refiner.h +++ 
b/kaminpar-dist/refinement/lp/clp_refiner.h @@ -53,7 +53,7 @@ class ColoredLPRefiner : public GlobalRefiner { public: void initialize(ColorID num_colors); void record_gain(EdgeWeight gain, ColorID c); - void summarize_by_size(const NoinitVector &color_sizes, MPI_Comm comm) const; + void summarize_by_size(const NoinitVector &color_sizes, MPI_Comm comm) const; private: std::vector _gain_per_color; diff --git a/kaminpar-dist/refinement/lp/lp_refiner.cc b/kaminpar-dist/refinement/lp/lp_refiner.cc index 324fa37a..f1f69726 100644 --- a/kaminpar-dist/refinement/lp/lp_refiner.cc +++ b/kaminpar-dist/refinement/lp/lp_refiner.cc @@ -14,12 +14,10 @@ #include "kaminpar-dist/datastructures/distributed_graph.h" #include "kaminpar-dist/datastructures/distributed_partitioned_graph.h" +#include "kaminpar-dist/distributed_label_propagation.h" #include "kaminpar-dist/graphutils/communication.h" #include "kaminpar-dist/metrics.h" -#include "kaminpar-shm/label_propagation.h" - -#include "kaminpar-common/datastructures/marker.h" #include "kaminpar-common/datastructures/rating_map.h" #include "kaminpar-common/math.h" #include "kaminpar-common/parallel/vector_ets.h" @@ -41,7 +39,7 @@ struct LPRefinerConfig : public LabelPropagationConfig { }; class LPRefinerImpl final : public ChunkRandomdLabelPropagation { - SET_STATISTICS(false); + SET_STATISTICS_FROM_GLOBAL(); SET_DEBUG(false); using Base = ChunkRandomdLabelPropagation; diff --git a/kaminpar-shm/coarsening/cluster_coarsener.cc b/kaminpar-shm/coarsening/cluster_coarsener.cc index 7d1f70b5..d512568c 100644 --- a/kaminpar-shm/coarsening/cluster_coarsener.cc +++ b/kaminpar-shm/coarsening/cluster_coarsener.cc @@ -7,59 +7,112 @@ ******************************************************************************/ #include "kaminpar-shm/coarsening/cluster_coarsener.h" -#include "kaminpar-common/logger.h" +#include "kaminpar-shm/coarsening/contraction/cluster_contraction.h" +#include "kaminpar-shm/coarsening/max_cluster_weights.h" +#include "kaminpar-shm/factories.h" +#include "kaminpar-shm/kaminpar.h" + +#include "kaminpar-common/assert.h" +#include "kaminpar-common/heap_profiler.h" #include "kaminpar-common/timer.h" namespace kaminpar::shm { -std::pair ClusteringCoarsener::compute_coarse_graph( - const NodeWeight max_cluster_weight, const NodeID to_size -) { +ClusteringCoarsener::ClusteringCoarsener(const Context &ctx, const PartitionContext &p_ctx) + : _clustering_algorithm(factory::create_clusterer(ctx)), + _c_ctx(ctx.coarsening), + _p_ctx(p_ctx) {} + +void ClusteringCoarsener::initialize(const Graph *graph) { + _hierarchy.clear(); + _input_graph = graph; +} + +bool ClusteringCoarsener::coarsen() { + SCOPED_HEAP_PROFILER("Level", std::to_string(_hierarchy.size())); SCOPED_TIMER("Level", std::to_string(_hierarchy.size())); - _clustering_algorithm->set_max_cluster_weight(max_cluster_weight); - _clustering_algorithm->set_desired_cluster_count(to_size); + if (_clustering.size() < current().n()) { + SCOPED_HEAP_PROFILER("Allocation"); + SCOPED_TIMER("Allocation"); + _clustering.resize(current().n()); + } - const auto &clustering = TIMED_SCOPE("Label Propagation") { - return _clustering_algorithm->compute_clustering(*_current_graph); - }; + const bool free_allocated_memory = !keep_allocated_memory(); + const NodeWeight total_node_weight = current().total_node_weight(); + const NodeID prev_n = current().n(); + + START_HEAP_PROFILER("Label Propagation"); + START_TIMER("Label Propagation"); + _clustering_algorithm->set_max_cluster_weight( + compute_max_cluster_weight(_c_ctx, 
_p_ctx, prev_n, total_node_weight) + ); + _clustering_algorithm->set_desired_cluster_count(0); + _clustering_algorithm->compute_clustering(_clustering, current(), free_allocated_memory); + STOP_TIMER(); + STOP_HEAP_PROFILER(); - auto [c_graph, c_mapping, m_ctx] = TIMED_SCOPE("Contract graph") { - return graph::contract(*_current_graph, clustering, std::move(_contraction_m_ctx)); + START_HEAP_PROFILER("Contract graph"); + auto coarsened = TIMED_SCOPE("Contract graph") { + return contract_clustering(current(), _clustering, _c_ctx.contraction, _contraction_m_ctx); }; - _contraction_m_ctx = std::move(m_ctx); + _hierarchy.push_back(std::move(coarsened)); + STOP_HEAP_PROFILER(); - const bool converged = _c_ctx.coarsening_should_converge(_current_graph->n(), c_graph.n()); + const NodeID next_n = current().n(); + const bool converged = (1.0 - 1.0 * next_n / prev_n) <= _c_ctx.convergence_threshold; - _hierarchy.push_back(std::move(c_graph)); - _mapping.push_back(std::move(c_mapping)); - _current_graph = &_hierarchy.back(); + if (free_allocated_memory) { + _contraction_m_ctx.buckets.free(); + _contraction_m_ctx.buckets_index.free(); + _contraction_m_ctx.all_buffered_nodes.free(); + } - return {_current_graph, !converged}; + return !converged; } PartitionedGraph ClusteringCoarsener::uncoarsen(PartitionedGraph &&p_graph) { - KASSERT(&p_graph.graph() == _current_graph); - KASSERT(!empty(), V(size())); + SCOPED_HEAP_PROFILER("Level", std::to_string(_hierarchy.size())); SCOPED_TIMER("Level", std::to_string(_hierarchy.size())); + const BlockID p_graph_k = p_graph.k(); + const auto p_graph_partition = p_graph.take_raw_partition(); + + auto coarsened = pop_hierarchy(std::move(p_graph)); + const NodeID next_n = current().n(); + + START_HEAP_PROFILER("Allocation"); START_TIMER("Allocation"); - auto mapping{std::move(_mapping.back())}; - _mapping.pop_back(); - _hierarchy.pop_back(); // destroys the graph wrapped in p_graph, but partition - // access is still ok - _current_graph = empty() ? 
&_input_graph : &_hierarchy.back(); - KASSERT(mapping.size() == _current_graph->n(), V(mapping.size()) << V(_current_graph->n())); - - StaticArray partition(_current_graph->n()); + RECORD("partition") StaticArray partition(next_n); STOP_TIMER(); + STOP_HEAP_PROFILER(); - START_TIMER("Copy partition"); - tbb::parallel_for(static_cast(0), _current_graph->n(), [&](const NodeID u) { - partition[u] = p_graph.block(mapping[u]); - }); + START_TIMER("Project partition"); + coarsened->project(p_graph_partition, partition); STOP_TIMER(); + SCOPED_HEAP_PROFILER("Create graph"); SCOPED_TIMER("Create graph"); - return {*_current_graph, p_graph.k(), std::move(partition)}; + return {current(), p_graph_k, std::move(partition)}; +} + +std::unique_ptr ClusteringCoarsener::pop_hierarchy(PartitionedGraph &&p_graph) { + KASSERT(!empty(), "cannot pop from an empty graph hierarchy", assert::light); + + auto coarsened = std::move(_hierarchy.back()); + _hierarchy.pop_back(); + + KASSERT( + &coarsened->get() == &p_graph.graph(), + "p_graph wraps a different graph (ptr=" + << &p_graph.graph() << ") than the one that was coarsened (ptr=" << &coarsened->get() + << ")", + assert::light + ); + + return coarsened; +} + +bool ClusteringCoarsener::keep_allocated_memory() const { + return level() >= _c_ctx.clustering.max_mem_free_coarsening_level; } } // namespace kaminpar::shm diff --git a/kaminpar-shm/coarsening/cluster_coarsener.h b/kaminpar-shm/coarsening/cluster_coarsener.h index d79af08e..833cca35 100644 --- a/kaminpar-shm/coarsening/cluster_coarsener.h +++ b/kaminpar-shm/coarsening/cluster_coarsener.h @@ -9,24 +9,15 @@ #include "kaminpar-shm/coarsening/clusterer.h" #include "kaminpar-shm/coarsening/coarsener.h" -#include "kaminpar-shm/context.h" +#include "kaminpar-shm/coarsening/contraction/cluster_contraction.h" #include "kaminpar-shm/datastructures/graph.h" #include "kaminpar-shm/datastructures/partitioned_graph.h" -#include "kaminpar-shm/graphutils/cluster_contraction.h" #include "kaminpar-shm/kaminpar.h" namespace kaminpar::shm { class ClusteringCoarsener : public Coarsener { public: - ClusteringCoarsener( - std::unique_ptr clustering_algorithm, - const Graph &input_graph, - const CoarseningContext &c_ctx - ) - : _input_graph(input_graph), - _current_graph(&input_graph), - _clustering_algorithm(std::move(clustering_algorithm)), - _c_ctx(c_ctx) {} + ClusteringCoarsener(const Context &ctx, const PartitionContext &p_ctx); ClusteringCoarsener(const ClusteringCoarsener &) = delete; ClusteringCoarsener &operator=(const ClusteringCoarsener) = delete; @@ -34,33 +25,33 @@ class ClusteringCoarsener : public Coarsener { ClusteringCoarsener(ClusteringCoarsener &&) = delete; ClusteringCoarsener &operator=(ClusteringCoarsener &&) = delete; - std::pair - compute_coarse_graph(NodeWeight max_cluster_weight, NodeID to_size) final; + void initialize(const Graph *graph) final; + + bool coarsen() final; PartitionedGraph uncoarsen(PartitionedGraph &&p_graph) final; - [[nodiscard]] const Graph *coarsest_graph() const final { - return _current_graph; + [[nodiscard]] const Graph &current() const final { + return _hierarchy.empty() ? 
*_input_graph : _hierarchy.back()->get(); } - [[nodiscard]] std::size_t size() const final { + [[nodiscard]] std::size_t level() const final { return _hierarchy.size(); } - void initialize(const Graph *) final {} +private: + std::unique_ptr pop_hierarchy(PartitionedGraph &&p_graph); - [[nodiscard]] const CoarseningContext &context() const { - return _c_ctx; - } + [[nodiscard]] bool keep_allocated_memory() const; -private: - const Graph &_input_graph; - const Graph *_current_graph; - std::vector _hierarchy; - std::vector> _mapping; + const CoarseningContext &_c_ctx; + const PartitionContext &_p_ctx; + + const Graph *_input_graph; + std::vector> _hierarchy; + StaticArray _clustering{}; std::unique_ptr _clustering_algorithm; - const CoarseningContext &_c_ctx; - graph::contraction::MemoryContext _contraction_m_ctx{}; + contraction::MemoryContext _contraction_m_ctx{}; }; } // namespace kaminpar::shm diff --git a/kaminpar-shm/coarsening/clusterer.h b/kaminpar-shm/coarsening/clusterer.h index da64e4fd..857bc029 100644 --- a/kaminpar-shm/coarsening/clusterer.h +++ b/kaminpar-shm/coarsening/clusterer.h @@ -10,14 +10,11 @@ #include "kaminpar-shm/datastructures/graph.h" #include "kaminpar-shm/kaminpar.h" -#include "kaminpar-common/datastructures/scalable_vector.h" -#include "kaminpar-common/parallel/atomic.h" +#include "kaminpar-common/datastructures/static_array.h" namespace kaminpar::shm { class Clusterer { public: - using AtomicClusterArray = scalable_vector>; - Clusterer() = default; Clusterer(const Clusterer &) = delete; @@ -39,6 +36,8 @@ class Clusterer { // Clustering function // - virtual const AtomicClusterArray &compute_clustering(const Graph &graph) = 0; + virtual void compute_clustering( + StaticArray &clustering, const Graph &graph, bool free_memory_afterwards + ) = 0; }; } // namespace kaminpar::shm diff --git a/kaminpar-shm/coarsening/lp_clustering.cc b/kaminpar-shm/coarsening/clustering/legacy_lp_clusterer.cc similarity index 67% rename from kaminpar-shm/coarsening/lp_clustering.cc rename to kaminpar-shm/coarsening/clustering/legacy_lp_clusterer.cc index 919e76d1..bcf0915b 100644 --- a/kaminpar-shm/coarsening/lp_clustering.cc +++ b/kaminpar-shm/coarsening/clustering/legacy_lp_clusterer.cc @@ -1,18 +1,16 @@ /****************************************************************************** * Label propagation for graph coarsening / clustering. 
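// Editorial sketch (not part of the patch): driving the reworked Clusterer interface shown above,
// which now writes into a caller-owned array instead of returning an internal AtomicClusterArray.
// The StaticArray element type is assumed to be NodeID (the template arguments do not survive in
// this rendering of the patch), and run_clusterer_once() is a hypothetical helper that mirrors
// what ClusteringCoarsener::coarsen() does.
#include "kaminpar-shm/coarsening/clusterer.h"

#include "kaminpar-common/datastructures/static_array.h"

namespace kaminpar::shm {
void run_clusterer_once(Clusterer &clusterer, const Graph &graph, const NodeWeight max_cluster_weight) {
  StaticArray<NodeID> clustering(graph.n()); // caller-owned output buffer, one entry per node
  clusterer.set_max_cluster_weight(max_cluster_weight);
  clusterer.set_desired_cluster_count(0); // 0, as passed by ClusteringCoarsener::coarsen()
  clusterer.compute_clustering(clustering, graph, /* free_memory_afterwards = */ false);
  // clustering[u] now holds the cluster to which node u was assigned.
}
} // namespace kaminpar::shm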
* - * @file: lp_clustering.cc + * @file: legacy_lp_clusterer.cc * @author: Daniel Seemaier * @date: 29.09.2021 ******************************************************************************/ -#include "kaminpar-shm/coarsening/lp_clustering.h" +#include "kaminpar-shm/coarsening/clustering/legacy_lp_clusterer.h" #include -#include "kaminpar-shm/coarsening/clusterer.h" -#include "kaminpar-shm/context.h" #include "kaminpar-shm/datastructures/graph.h" -#include "kaminpar-shm/label_propagation.h" +#include "kaminpar-shm/legacy_label_propagation.h" #include "kaminpar-common/timer.h" @@ -21,42 +19,41 @@ namespace kaminpar::shm { // Actual implementation -- not exposed in header // -struct LPClusteringConfig : public LabelPropagationConfig { +struct LegacyLPClusteringConfig : public LegacyLabelPropagationConfig { using ClusterID = NodeID; using ClusterWeight = BlockWeight; static constexpr bool kTrackClusterCount = true; static constexpr bool kUseTwoHopClustering = true; }; -class LPClusteringImpl final - : public ChunkRandomdLabelPropagation, - public OwnedRelaxedClusterWeightVector, - public OwnedClusterVector, - public Clusterer { +class LegacyLPClusteringImpl final + : public ChunkRandomdLegacyLabelPropagation, + public LegacyOwnedRelaxedClusterWeightVector, + public LegacyNonatomicClusterVectorRef { SET_DEBUG(false); - using Base = ChunkRandomdLabelPropagation; - using ClusterWeightBase = OwnedRelaxedClusterWeightVector; - using ClusterBase = OwnedClusterVector; + using Base = ChunkRandomdLegacyLabelPropagation; + using ClusterWeightBase = LegacyOwnedRelaxedClusterWeightVector; + using ClusterBase = LegacyNonatomicClusterVectorRef; public: - LPClusteringImpl(const NodeID max_n, const CoarseningContext &c_ctx) - : ClusterWeightBase(max_n), - ClusterBase(max_n), - _c_ctx(c_ctx) { - allocate(max_n, max_n); - set_max_degree(c_ctx.lp.large_degree_threshold); - set_max_num_neighbors(c_ctx.lp.max_num_neighbors); + LegacyLPClusteringImpl(const CoarseningContext &c_ctx) : _lp_ctx(c_ctx.clustering.lp) { + set_max_degree(_lp_ctx.large_degree_threshold); + set_max_num_neighbors(_lp_ctx.max_num_neighbors); } - void set_max_cluster_weight(const NodeWeight max_cluster_weight) final { + void set_max_cluster_weight(const NodeWeight max_cluster_weight) { _max_cluster_weight = max_cluster_weight; } - const AtomicClusterArray &compute_clustering(const Graph &graph) final { + void compute_clustering(StaticArray &clustering, const CSRGraph &graph, bool) { + allocate(graph.n(), graph.n()); + allocate_cluster_weights(graph.n()); + + init_clusters_ref(clustering); initialize(&graph, graph.n()); - for (int iteration = 0; iteration < _c_ctx.lp.num_iterations; ++iteration) { + for (int iteration = 0; iteration < _lp_ctx.num_iterations; ++iteration) { SCOPED_TIMER("Iteration", std::to_string(iteration)); if (perform_iteration() == 0) { break; @@ -65,8 +62,6 @@ class LPClusteringImpl final cluster_isolated_nodes(); cluster_two_hop_nodes(); - - return clusters(); } private: @@ -77,7 +72,7 @@ class LPClusteringImpl final return; } - switch (_c_ctx.lp.two_hop_strategy) { + switch (_lp_ctx.two_hop_strategy) { case TwoHopStrategy::MATCH: match_two_hop_nodes(); break; @@ -106,7 +101,7 @@ class LPClusteringImpl final void cluster_isolated_nodes() { SCOPED_TIMER("Handle isolated nodes"); - switch (_c_ctx.lp.isolated_nodes_strategy) { + switch (_lp_ctx.isolated_nodes_strategy) { case IsolatedNodesClusteringStrategy::MATCH: match_isolated_nodes(); break; @@ -133,7 +128,7 @@ class LPClusteringImpl final } [[nodiscard]] bool 
should_handle_two_hop_nodes() const { - return (1.0 - 1.0 * _current_num_clusters / _graph->n()) <= _c_ctx.lp.two_hop_threshold; + return (1.0 - 1.0 * _current_num_clusters / _graph->n()) <= _lp_ctx.two_hop_threshold; } // @todo: old implementation that should no longer be used @@ -171,15 +166,27 @@ class LPClusteringImpl final // If this works, we set ourself as clustering partners for nodes that have the same favored // cluster we have NodeID expected_value = favored_leader; - if (_favored_clusters[favored_leader].compare_exchange_strong(expected_value, u)) { + if (__atomic_compare_exchange_n( + &_favored_clusters[favored_leader], + &expected_value, + u, + false, + __ATOMIC_SEQ_CST, + __ATOMIC_SEQ_CST + )) { break; } // If this did not work, there is another node that has the same favored cluster // Try to join the cluster of that node const NodeID partner = expected_value; - if (_favored_clusters[favored_leader].compare_exchange_strong( - expected_value, favored_leader + if (__atomic_compare_exchange_n( + &_favored_clusters[favored_leader], + &expected_value, + favored_leader, + false, + __ATOMIC_SEQ_CST, + __ATOMIC_SEQ_CST )) { if (move_cluster_weight(u, partner, cluster_weight(u), max_cluster_weight(partner))) { move_node(u, partner); @@ -216,7 +223,7 @@ class LPClusteringImpl final using Base::_current_num_clusters; using Base::_graph; - const CoarseningContext &_c_ctx; + const LabelPropagationCoarseningContext &_lp_ctx; NodeWeight _max_cluster_weight = kInvalidBlockWeight; }; @@ -224,22 +231,30 @@ class LPClusteringImpl final // Exposed wrapper // -LPClustering::LPClustering(const NodeID max_n, const CoarseningContext &c_ctx) - : _core{std::make_unique(max_n, c_ctx)} {} +LegacyLPClustering::LegacyLPClustering(const CoarseningContext &c_ctx) + : _core(std::make_unique(c_ctx)) {} // we must declare the destructor explicitly here, otherwise, it is implicitly -// generated before LabelPropagationClusterCore is complete -LPClustering::~LPClustering() = default; +// generated before LegacyLabelPropagationClusterCore is complete +LegacyLPClustering::~LegacyLPClustering() = default; -void LPClustering::set_max_cluster_weight(const NodeWeight max_cluster_weight) { +void LegacyLPClustering::set_max_cluster_weight(const NodeWeight max_cluster_weight) { _core->set_max_cluster_weight(max_cluster_weight); } -void LPClustering::set_desired_cluster_count(const NodeID count) { +void LegacyLPClustering::set_desired_cluster_count(const NodeID count) { _core->set_desired_num_clusters(count); } -const Clusterer::AtomicClusterArray &LPClustering::compute_clustering(const Graph &graph) { - return _core->compute_clustering(graph); +void LegacyLPClustering::compute_clustering( + StaticArray &clustering, const Graph &graph, bool +) { + if (auto *csr_graph = dynamic_cast(graph.underlying_graph()); + csr_graph != nullptr) { + _core->compute_clustering(clustering, *csr_graph, false); + return; + } + + __builtin_unreachable(); } } // namespace kaminpar::shm diff --git a/kaminpar-shm/coarsening/clustering/legacy_lp_clusterer.h b/kaminpar-shm/coarsening/clustering/legacy_lp_clusterer.h new file mode 100644 index 00000000..cfa45a9d --- /dev/null +++ b/kaminpar-shm/coarsening/clustering/legacy_lp_clusterer.h @@ -0,0 +1,36 @@ +/****************************************************************************** + * Label propagation for graph coarsening / clustering. 
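// Editorial sketch (not part of the patch): semantics of the __atomic_compare_exchange_n builtin
// that replaces std::atomic<>::compare_exchange_strong in the two-hop matching code above,
// presumably because the favored-cluster array now stores plain (non-std::atomic) integers. On
// failure the builtin writes the observed value back into `expected`, which is exactly what the
// matching logic uses to discover its clustering partner. Standalone toy example with made-up values.
#include <cassert>
#include <cstdint>

int main() {
  std::uint32_t slot = 42;     // stands in for _favored_clusters[favored_leader]
  std::uint32_t expected = 42; // we expect the slot to still hold its original value
  const std::uint32_t first = 7;
  const std::uint32_t second = 9;

  // Succeeds: slot == expected, so slot becomes 7 and the call returns true.
  bool ok = __atomic_compare_exchange_n(&slot, &expected, first, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
  assert(ok && slot == 7);

  // Fails: slot holds 7, not 42; `expected` is overwritten with the observed value 7.
  expected = 42;
  ok = __atomic_compare_exchange_n(&slot, &expected, second, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
  assert(!ok && expected == 7 && slot == 7);

  return 0;
}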
+ * + * @file: legacy_lp_clusterer.h + * @author: Daniel Seemaier + * @date: 29.09.2021 + ******************************************************************************/ +#pragma once + +#include "kaminpar-shm/coarsening/clusterer.h" +#include "kaminpar-shm/datastructures/graph.h" + +namespace kaminpar::shm { +class LegacyLPClustering : public Clusterer { +public: + LegacyLPClustering(const CoarseningContext &c_ctx); + + LegacyLPClustering(const LegacyLPClustering &) = delete; + LegacyLPClustering &operator=(const LegacyLPClustering &) = delete; + + LegacyLPClustering(LegacyLPClustering &&) noexcept = default; + LegacyLPClustering &operator=(LegacyLPClustering &&) noexcept = default; + + ~LegacyLPClustering() override; + + void set_max_cluster_weight(NodeWeight max_cluster_weight) final; + void set_desired_cluster_count(NodeID count) final; + + void compute_clustering( + StaticArray &clustering, const Graph &graph, bool free_memory_afterwards + ) final; + +private: + std::unique_ptr _core; +}; +} // namespace kaminpar::shm diff --git a/kaminpar-shm/coarsening/clustering/lp_clusterer.cc b/kaminpar-shm/coarsening/clustering/lp_clusterer.cc new file mode 100644 index 00000000..bdbf8095 --- /dev/null +++ b/kaminpar-shm/coarsening/clustering/lp_clusterer.cc @@ -0,0 +1,365 @@ +/****************************************************************************** + * Label propagation for graph coarsening / clustering. + * + * @file: lp_clusterer.cc + * @author: Daniel Seemaier + * @date: 29.09.2021 + ******************************************************************************/ +#include "kaminpar-shm/coarsening/clustering/lp_clusterer.h" + +#include "kaminpar-shm/label_propagation.h" + +#include "kaminpar-common/heap_profiler.h" +#include "kaminpar-common/timer.h" + +namespace kaminpar::shm { + +// +// Actual implementation -- not exposed in header +// + +struct LPClusteringConfig : public LabelPropagationConfig { + using ClusterID = NodeID; + using ClusterWeight = BlockWeight; + static constexpr bool kTrackClusterCount = true; + static constexpr bool kUseTwoHopClustering = true; +}; + +template +class LPClusteringImpl final + : public ChunkRandomLabelPropagation, LPClusteringConfig, Graph>, + public OwnedRelaxedClusterWeightVector, + public NonatomicClusterVectorRef { + SET_DEBUG(false); + + using Base = ChunkRandomLabelPropagation; + using ClusterWeightBase = OwnedRelaxedClusterWeightVector; + using ClusterBase = NonatomicClusterVectorRef; + +public: + using Permutations = Base::Permutations; + + LPClusteringImpl(const CoarseningContext &c_ctx, Permutations &permutations) + : Base(permutations), + ClusterWeightBase(c_ctx.clustering.lp.use_two_level_cluster_weight_vector), + _lp_ctx(c_ctx.clustering.lp) { + Base::set_max_degree(_lp_ctx.large_degree_threshold); + Base::set_max_num_neighbors(_lp_ctx.max_num_neighbors); + Base::set_use_two_phases(_lp_ctx.use_two_phases); + Base::set_second_phase_select_mode(_lp_ctx.second_phase_select_mode); + Base::set_second_phase_aggregation_mode(_lp_ctx.second_phase_aggregation_mode); + Base::set_relabel_before_second_phase(_lp_ctx.relabel_before_second_phase); + } + + void set_max_cluster_weight(const NodeWeight max_cluster_weight) { + _max_cluster_weight = max_cluster_weight; + } + + void preinitialize(const NodeID num_nodes) { + Base::preinitialize(num_nodes, num_nodes); + } + + void allocate(const NodeID num_clusters) { + SCOPED_HEAP_PROFILER("Allocation"); + SCOPED_TIMER("Allocation"); + + Base::allocate(); + 
ClusterWeightBase::allocate_cluster_weights(num_clusters); + } + + void free() { + SCOPED_HEAP_PROFILER("Free"); + SCOPED_TIMER("Free"); + + Base::free(); + ClusterWeightBase::free(); + } + + void compute_clustering(StaticArray &clustering, const Graph &graph) { + ClusterWeightBase::reset_cluster_weights(); + ClusterBase::init_clusters_ref(clustering); + Base::initialize(&graph, graph.n()); + + for (std::size_t iteration = 0; iteration < _lp_ctx.num_iterations; ++iteration) { + SCOPED_TIMER("Iteration", std::to_string(iteration)); + if (Base::perform_iteration() == 0) { + break; + } + + // Only relabel during the first iteration because afterwards the memory for the second phase + // is already allocated. + if (iteration == 0) { + Base::set_relabel_before_second_phase(false); + } + } + + cluster_isolated_nodes(); + cluster_two_hop_nodes(); + } + +private: + void cluster_two_hop_nodes() { + SCOPED_HEAP_PROFILER("Handle two-hop nodes"); + SCOPED_TIMER("Handle two-hop nodes"); + + if (!should_handle_two_hop_nodes()) { + return; + } + + switch (_lp_ctx.two_hop_strategy) { + case TwoHopStrategy::MATCH: + Base::match_two_hop_nodes(); + break; + case TwoHopStrategy::MATCH_THREADWISE: + Base::match_two_hop_nodes_threadwise(); + break; + case TwoHopStrategy::CLUSTER: + Base::cluster_two_hop_nodes(); + break; + case TwoHopStrategy::CLUSTER_THREADWISE: + Base::cluster_two_hop_nodes_threadwise(); + break; + case TwoHopStrategy::LEGACY: + handle_two_hop_clustering_legacy(); + break; + case TwoHopStrategy::DISABLE: + break; + } + } + + void cluster_isolated_nodes() { + SCOPED_HEAP_PROFILER("Handle isolated nodes"); + SCOPED_TIMER("Handle isolated nodes"); + + switch (_lp_ctx.isolated_nodes_strategy) { + case IsolatedNodesClusteringStrategy::MATCH: + Base::match_isolated_nodes(); + break; + case IsolatedNodesClusteringStrategy::CLUSTER: + Base::cluster_isolated_nodes(); + break; + case IsolatedNodesClusteringStrategy::MATCH_DURING_TWO_HOP: + if (should_handle_two_hop_nodes()) { + Base::match_isolated_nodes(); + } + break; + case IsolatedNodesClusteringStrategy::CLUSTER_DURING_TWO_HOP: + if (should_handle_two_hop_nodes()) { + Base::cluster_isolated_nodes(); + } + break; + case IsolatedNodesClusteringStrategy::KEEP: + break; + } + } + + [[nodiscard]] bool should_handle_two_hop_nodes() const { + return (1.0 - 1.0 * _current_num_clusters / _graph->n()) <= _lp_ctx.two_hop_threshold; + } + + // @todo: old implementation that should no longer be used + void handle_two_hop_clustering_legacy() { + // Reset _favored_clusters entries for nodes that are not considered for + // 2-hop clustering, i.e., nodes that are already clustered with at least one other node or + // nodes that have more weight than max_weight/2. 
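// Editorial sketch (not part of the patch): the relative shrink factor used twice above, once in
// ClusteringCoarsener::coarsen() against the convergence threshold and once in
// should_handle_two_hop_nodes() against the two-hop threshold. The node counts and the 0.05
// threshold are made-up illustration values.
#include <cstdint>
#include <iostream>

int main() {
  const std::uint64_t prev_n = 1'000'000; // nodes before contraction (or n() before clustering)
  const std::uint64_t next_n = 980'000;   // nodes after contraction (or resulting cluster count)

  const double shrink = 1.0 - 1.0 * next_n / prev_n; // = 0.02, i.e., the graph shrank by 2%
  const double threshold = 0.05;                     // assumed threshold value

  // A 2% reduction is below the assumed 5% threshold: coarsen() would report convergence, and the
  // LP clusterer would fall back to its two-hop handling.
  std::cout << "shrink factor = " << shrink << (shrink <= threshold ? " (below threshold)\n" : " (above threshold)\n");
  return 0;
}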
+ // Set _favored_clusters to dummy entry _graph->n() for isolated nodes + tbb::parallel_for(0, _graph->n(), [&](const NodeID u) { + if (u != cluster(u)) { + Base::_favored_clusters[u] = u; + } else { + const auto initial_weight = initial_cluster_weight(u); + const auto current_weight = ClusterWeightBase::cluster_weight(u); + const auto max_weight = max_cluster_weight(u); + if (current_weight != initial_weight || current_weight > max_weight / 2) { + Base::_favored_clusters[u] = u; + } + } + }); + + tbb::parallel_for(0, _graph->n(), [&](const NodeID u) { + // Abort once we have merged enough clusters to achieve the configured minimum shrink factor + if (Base::should_stop()) { + return; + } + + // Skip nodes that should not be considered during 2-hop clustering + const NodeID favored_leader = Base::_favored_clusters[u]; + if (favored_leader == u) { + return; + } + + do { + // If this works, we set ourself as clustering partners for nodes that have the same favored + // cluster we have + NodeID expected_value = favored_leader; + if (__atomic_compare_exchange_n( + &Base::_favored_clusters[favored_leader], + &expected_value, + u, + false, + __ATOMIC_SEQ_CST, + __ATOMIC_SEQ_CST + )) { + break; + } + + // If this did not work, there is another node that has the same favored cluster + // Try to join the cluster of that node + const NodeID partner = expected_value; + if (__atomic_compare_exchange_n( + &Base::_favored_clusters[favored_leader], + &expected_value, + favored_leader, + false, + __ATOMIC_SEQ_CST, + __ATOMIC_SEQ_CST + )) { + if (ClusterWeightBase::move_cluster_weight( + u, partner, ClusterWeightBase::cluster_weight(u), max_cluster_weight(partner) + )) { + move_node(u, partner); + --_current_num_clusters; + } + + break; + } + } while (true); + }); + } + +public: + [[nodiscard]] NodeID initial_cluster(const NodeID u) { + return u; + } + + [[nodiscard]] NodeWeight initial_cluster_weight(const NodeID cluster) { + return _graph->node_weight(cluster); + } + + [[nodiscard]] NodeWeight max_cluster_weight(const NodeID /* cluster */) { + return _max_cluster_weight; + } + + [[nodiscard]] bool accept_cluster(const Base::ClusterSelectionState &state) { + return (state.current_gain > state.best_gain || + (state.current_gain == state.best_gain && state.local_rand.random_bool())) && + (state.current_cluster_weight + state.u_weight <= + max_cluster_weight(state.current_cluster) || + state.current_cluster == state.initial_cluster); + } + + using Base::_current_num_clusters; + using Base::_graph; + + const LabelPropagationCoarseningContext &_lp_ctx; + NodeWeight _max_cluster_weight = kInvalidBlockWeight; +}; + +class LPClusteringImplWrapper { +public: + LPClusteringImplWrapper(const CoarseningContext &c_ctx) + : _csr_core(std::make_unique>(c_ctx, _permutations)), + _compact_csr_core(std::make_unique>(c_ctx, _permutations) + ), + _compressed_core(std::make_unique>(c_ctx, _permutations) + ) {} + + void set_max_cluster_weight(const NodeWeight max_cluster_weight) { + _csr_core->set_max_cluster_weight(max_cluster_weight); + _compact_csr_core->set_max_cluster_weight(max_cluster_weight); + _compressed_core->set_max_cluster_weight(max_cluster_weight); + } + + void set_desired_cluster_count(const NodeID count) { + _csr_core->set_desired_num_clusters(count); + _compact_csr_core->set_desired_num_clusters(count); + _compressed_core->set_desired_num_clusters(count); + } + + void compute_clustering( + StaticArray &clustering, const Graph &graph, const bool free_memory_afterwards + ) { + // Compute a clustering and 
setup/release the data structures used by the core, so that they can + // be shared by all implementations. + const auto compute = [&](auto &core, auto &graph) { + if (_freed) { + _freed = false; + core.allocate(graph.n()); + } else { + core.setup(std::move(_structs)); + core.setup_cluster_weights(std::move(_cluster_weights)); + } + + core.compute_clustering(clustering, graph); + + if (free_memory_afterwards) { + _freed = true; + core.free(); + } else { + _structs = core.release(); + _cluster_weights = core.take_cluster_weights(); + } + }; + + const NodeID num_nodes = graph.n(); + _csr_core->preinitialize(num_nodes); + _compact_csr_core->preinitialize(num_nodes); + _compressed_core->preinitialize(num_nodes); + + if (auto *csr_graph = dynamic_cast(graph.underlying_graph()); + csr_graph != nullptr) { + compute(*_csr_core, *csr_graph); + } else if (auto *compact_csr_graph = + dynamic_cast(graph.underlying_graph()); + compact_csr_graph != nullptr) { + compute(*_compact_csr_core, *compact_csr_graph); + } else if (auto *compressed_graph = + dynamic_cast(graph.underlying_graph()); + compressed_graph != nullptr) { + compute(*_compressed_core, *compressed_graph); + } + + // Only relabel clusters during the first iteration + _csr_core->set_relabel_before_second_phase(false); + _compact_csr_core->set_relabel_before_second_phase(false); + _compressed_core->set_relabel_before_second_phase(false); + } + +private: + std::unique_ptr> _csr_core; + std::unique_ptr> _compact_csr_core; + std::unique_ptr> _compressed_core; + + // The data structures that are used by the LP clusterer and are shared between the + // different implementations. + bool _freed = true; + LPClusteringImpl::Permutations _permutations; + LPClusteringImpl::DataStructures _structs; + LPClusteringImpl::ClusterWeights _cluster_weights; +}; + +// +// Exposed wrapper +// + +LPClustering::LPClustering(const CoarseningContext &c_ctx) + : _impl_wrapper(std::make_unique(c_ctx)) {} + +// we must declare the destructor explicitly here, otherwise, it is implicitly +// generated before LPClusteringImplWrapper is complete +LPClustering::~LPClustering() = default; + +void LPClustering::set_max_cluster_weight(const NodeWeight max_cluster_weight) { + _impl_wrapper->set_max_cluster_weight(max_cluster_weight); +} + +void LPClustering::set_desired_cluster_count(const NodeID count) { + _impl_wrapper->set_desired_cluster_count(count); +} + +void LPClustering::compute_clustering( + StaticArray &clustering, const Graph &graph, const bool free_memory_afterwards +) { + return _impl_wrapper->compute_clustering(clustering, graph, free_memory_afterwards); +} +} // namespace kaminpar::shm diff --git a/kaminpar-shm/coarsening/lp_clustering.h b/kaminpar-shm/coarsening/clustering/lp_clusterer.h similarity index 75% rename from kaminpar-shm/coarsening/lp_clustering.h rename to kaminpar-shm/coarsening/clustering/lp_clusterer.h index 8e2b87bd..7e81bf06 100644 --- a/kaminpar-shm/coarsening/lp_clustering.h +++ b/kaminpar-shm/coarsening/clustering/lp_clusterer.h @@ -1,20 +1,22 @@ /****************************************************************************** * Label propagation for graph coarsening / clustering. 
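// Editorial sketch (not part of the patch): the dispatch-by-representation pattern used by
// LPClusteringImplWrapper::compute_clustering() above. The template arguments of the dynamic_casts
// are lost in this rendering of the patch; the pointer types below (const CSRGraph *,
// const CompactCSRGraph *, const CompressedGraph *) and the single include are assumptions based
// on the surrounding LPClusteringImpl instantiations, and dispatch_by_representation() is a
// hypothetical helper.
#include "kaminpar-shm/datastructures/graph.h"

namespace kaminpar::shm {
template <typename OnCSR, typename OnCompactCSR, typename OnCompressed>
void dispatch_by_representation(
    const Graph &graph, OnCSR &&on_csr, OnCompactCSR &&on_compact_csr, OnCompressed &&on_compressed
) {
  if (const auto *csr = dynamic_cast<const CSRGraph *>(graph.underlying_graph()); csr != nullptr) {
    on_csr(*csr); // e.g., run the core specialized for plain CSR graphs
  } else if (const auto *compact = dynamic_cast<const CompactCSRGraph *>(graph.underlying_graph());
             compact != nullptr) {
    on_compact_csr(*compact);
  } else if (const auto *compressed = dynamic_cast<const CompressedGraph *>(graph.underlying_graph());
             compressed != nullptr) {
    on_compressed(*compressed);
  }
}
} // namespace kaminpar::shm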
* - * @file: lp_clustering.h + * @file: lp_clusterer.h * @author: Daniel Seemaier * @date: 29.09.2021 ******************************************************************************/ #pragma once +#include + #include "kaminpar-shm/coarsening/clusterer.h" -#include "kaminpar-shm/context.h" #include "kaminpar-shm/datastructures/graph.h" namespace kaminpar::shm { + class LPClustering : public Clusterer { public: - LPClustering(NodeID max_n, const CoarseningContext &c_ctx); + LPClustering(const CoarseningContext &c_ctx); LPClustering(const LPClustering &) = delete; LPClustering &operator=(const LPClustering &) = delete; @@ -27,10 +29,12 @@ class LPClustering : public Clusterer { void set_max_cluster_weight(NodeWeight max_cluster_weight) final; void set_desired_cluster_count(NodeID count) final; - const AtomicClusterArray &compute_clustering(const Graph &graph) final; + void compute_clustering( + StaticArray &clustering, const Graph &graph, bool free_memory_afterwards + ) final; private: - std::unique_ptr _core; + std::unique_ptr _impl_wrapper; }; } // namespace kaminpar::shm diff --git a/kaminpar-shm/coarsening/clustering/noop_clusterer.h b/kaminpar-shm/coarsening/clustering/noop_clusterer.h new file mode 100644 index 00000000..f6fdcd5d --- /dev/null +++ b/kaminpar-shm/coarsening/clustering/noop_clusterer.h @@ -0,0 +1,45 @@ +/******************************************************************************* + * A dummy clusterer that assigns each node to its own singleton cluster. + * + * @file: noop_clusterer.h + * @author: Daniel Seemaier + * @date: 16.06.2024 + ******************************************************************************/ +#pragma once + +#include "kaminpar-shm/coarsening/clusterer.h" +#include "kaminpar-shm/datastructures/graph.h" +#include "kaminpar-shm/kaminpar.h" + +#include "kaminpar-common/datastructures/static_array.h" + +namespace kaminpar::shm { +class NoopClusterer : public Clusterer { +public: + NoopClusterer() = default; + + NoopClusterer(const NoopClusterer &) = delete; + NoopClusterer &operator=(const NoopClusterer &) = delete; + + NoopClusterer(NoopClusterer &&) noexcept = default; + NoopClusterer &operator=(NoopClusterer &&) noexcept = default; + + // + // Optional options + // + + virtual void set_max_cluster_weight(const NodeWeight /* weight */) {} + virtual void set_desired_cluster_count(const NodeID /* count */) {} + + // + // Clustering function + // + + virtual void compute_clustering( + StaticArray &clustering, const Graph &graph, bool free_memory_afterwards + ) { + tbb::parallel_for(0, graph.n(), [&](const NodeID i) { clustering[i] = i; }); + } +}; +} // namespace kaminpar::shm + diff --git a/kaminpar-shm/coarsening/coarsener.h b/kaminpar-shm/coarsening/coarsener.h index c075053d..e3e608e1 100644 --- a/kaminpar-shm/coarsening/coarsener.h +++ b/kaminpar-shm/coarsening/coarsener.h @@ -1,5 +1,5 @@ /******************************************************************************* - * Interface for graph coarseners. + * Interface for the coarsening phase of multilevel graph partitioning. * * @file: coarsener.h * @author: Daniel Seemaier @@ -13,11 +13,7 @@ namespace kaminpar::shm { /** - * Clustering graphutils. - * - * Call #coarsen() repeatedly to produce a hierarchy of coarse graph. The coarse - * graphs are owned by the clustering graphutils. To unroll the graph hierarchy, - * call #uncoarsen() with a partition of the currently coarsest graph. + * Interface for the coarsening phase of multilevel graph partitioning. 
*/ class Coarsener { public: @@ -32,25 +28,32 @@ class Coarsener { virtual ~Coarsener() = default; /** - * Coarsen the currently coarsest graph with a static maximum node weight. + * Initializes the coarsener with a new toplevel graph. + */ + virtual void initialize(const Graph *graph) = 0; + + /** + * Computes the next level of the graph hierarchy. * - * @param max_cluster_weight Maximum node weight of the coarse graph. - * @param to_size Desired size of the coarse graph. - * @return New coarsest graph and whether coarsening has not converged. + * @return whether coarsening has *not* yet converged. */ - virtual std::pair - compute_coarse_graph(NodeWeight max_cluster_weight, NodeID to_size) = 0; + virtual bool coarsen() = 0; - /** @return The currently coarsest graph, or the input graph, if no coarse - * graphs have been computed so far. */ - [[nodiscard]] virtual const Graph *coarsest_graph() const = 0; + /** + * @return the coarsest graph in the hierarchy. + */ + [[nodiscard]] virtual const Graph &current() const = 0; - /** @return Number of coarsest graphs that have already been computed. */ - [[nodiscard]] virtual std::size_t size() const = 0; + /** + * @return number of coarse graphs in the hierarchy. + */ + [[nodiscard]] virtual std::size_t level() const = 0; - /** @return Whether we have not computed any coarse graphs so far. */ + /** + * @return whether we have *not* yet computed any coarse graphs. + */ [[nodiscard]] bool empty() const { - return size() == 0; + return level() == 0; } /** @@ -58,13 +61,11 @@ class Coarsener { * graph and frees the currently coarsest graph, i.e., unrolls one level of * the coarse graph hierarchy. * - * @param p_graph Partition of the currently coarsest graph, i.e., - * `p_graph.graph() == *coarsest_graph()`. - * @return Partition of the new coarsest graph. + * @param p_graph Partition of the currently coarsest graph. + * Precondition: `p_graph.graph() == current()`. + * + * @return partition of the *new* coarsest graph. */ virtual PartitionedGraph uncoarsen(PartitionedGraph &&p_graph) = 0; - - //! Re-initialize this coarsener object with a new graph. - virtual void initialize(const Graph *graph) = 0; }; } // namespace kaminpar::shm diff --git a/kaminpar-shm/coarsening/contraction/buffered_cluster_contraction.cc b/kaminpar-shm/coarsening/contraction/buffered_cluster_contraction.cc new file mode 100644 index 00000000..41f6f5b3 --- /dev/null +++ b/kaminpar-shm/coarsening/contraction/buffered_cluster_contraction.cc @@ -0,0 +1,298 @@ +/******************************************************************************* + * Contraction implementation that uses an edge buffer to store edges before + * building the final graph. + * + * @file: buffered_cluster_contraction.cc + * @author: Daniel Seemaier + * @author: Daniel Salwasser + * @date: 21.09.2021 + ******************************************************************************/ +#include "kaminpar-shm/coarsening/contraction/buffered_cluster_contraction.h" + +#include + +#include "kaminpar-shm/coarsening/contraction/cluster_contraction.h" +#include "kaminpar-shm/coarsening/contraction/cluster_contraction_preprocessing.h" + +#include "kaminpar-common/datastructures/compact_static_array.h" +#include "kaminpar-common/datastructures/rating_map.h" +#include "kaminpar-common/datastructures/static_array.h" +#include "kaminpar-common/heap_profiler.h" +#include "kaminpar-common/timer.h" + +namespace kaminpar::shm::contraction { +namespace { +template
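// Editorial sketch (not part of the patch): how the reworked Coarsener interface above is meant to
// be driven. coarsen_down_to() is a hypothetical helper and the stopping condition on the coarse
// graph size is an illustrative assumption; the initialize()/coarsen()/current()/level() contract
// follows the interface comments in the diff.
#include "kaminpar-shm/coarsening/coarsener.h"

namespace kaminpar::shm {
const Graph &coarsen_down_to(Coarsener &coarsener, const Graph &input_graph, const NodeID target_n) {
  coarsener.initialize(&input_graph);

  // coarsen() computes one more hierarchy level and returns false once coarsening has converged.
  while (coarsener.current().n() > target_n && coarsener.coarsen()) {
    // coarsener.level() counts the coarse graphs computed so far; current() is the coarsest one.
  }

  return coarsener.current();
}
} // namespace kaminpar::shm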